已经是最新一篇文章了!
已经是最后一篇文章了!
MoE架构原理与实现
前言
混合专家模型(Mixture of Experts, MoE)是一种稀疏激活的架构,通过将计算分配给不同的“专家”网络,在保持高效推理的同时扩展模型容量。MoE已成为现代大型语言模型(如GPT-4、Mixtral)的核心架构。
MoE核心概念
基本架构
MoE由两个核心组件组成:
- 专家网络(Experts):多个独立的前馈网络
- 门控网络(Gating Network):决定每个输入激活哪些专家
import numpy as np
import matplotlib.pyplot as plt

# Fix the RNG seed so weights, routing and plots are reproducible across runs.
np.random.seed(42)
# Draw a schematic of the MoE architecture: input -> gate + experts -> output.
def visualize_moe_architecture():
    fig, ax = plt.subplots(figsize=(12, 8))

    # Input box.
    ax.add_patch(plt.Rectangle((0.1, 0.4), 0.15, 0.2, fill=True, color='lightblue'))
    ax.text(0.175, 0.5, '输入\nx', ha='center', va='center', fontsize=10)

    # Gating-network box.
    ax.add_patch(plt.Rectangle((0.35, 0.7), 0.15, 0.15, fill=True, color='lightgreen'))
    ax.text(0.425, 0.775, '门控\nG(x)', ha='center', va='center', fontsize=9)

    # Expert boxes, stacked vertically.
    expert_rows = [0.15, 0.4, 0.65]
    for idx, row in enumerate(expert_rows):
        ax.add_patch(plt.Rectangle((0.55, row), 0.12, 0.12, fill=True, color='lightyellow'))
        ax.text(0.61, row + 0.06, f'E{idx + 1}', ha='center', va='center', fontsize=10)

    # Output box.
    ax.add_patch(plt.Rectangle((0.75, 0.4), 0.15, 0.2, fill=True, color='lightcoral'))
    ax.text(0.825, 0.5, '输出\ny', ha='center', va='center', fontsize=10)

    # Arrows: input toward the expert column and up to the gate.
    ax.annotate('', xy=(0.35, 0.5), xytext=(0.25, 0.5),
                arrowprops=dict(arrowstyle='->', color='black'))
    ax.annotate('', xy=(0.35, 0.775), xytext=(0.25, 0.55),
                arrowprops=dict(arrowstyle='->', color='blue'))

    # Arrows: input -> each expert, and each expert -> output.
    for row in expert_rows:
        ax.annotate('', xy=(0.55, row + 0.06), xytext=(0.25, 0.5),
                    arrowprops=dict(arrowstyle='->', color='gray', alpha=0.5))
        ax.annotate('', xy=(0.75, 0.5), xytext=(0.67, row + 0.06),
                    arrowprops=dict(arrowstyle='->', color='gray', alpha=0.5))

    # Arrow from the gate carrying the mixing weights.
    ax.annotate('', xy=(0.55, 0.71), xytext=(0.50, 0.775),
                arrowprops=dict(arrowstyle='->', color='green'))
    ax.text(0.52, 0.72, 'weights', fontsize=8, color='green')

    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.set_aspect('equal')
    ax.axis('off')
    ax.set_title('Mixture of Experts 架构', fontsize=14)
    plt.tight_layout()
    plt.show()

visualize_moe_architecture()
数学公式
MoE的输出计算:
\[y = \sum_{i=1}^{N} G(x)_i \cdot E_i(x)\]
其中:
- $G(x)$ 是门控网络输出的权重向量
- $E_i(x)$ 是第 $i$ 个专家的输出
- $N$ 是专家总数
从零实现MoE
简单MoE层
class Expert:
    """A single expert: a two-layer MLP with a ReLU in between."""

    def __init__(self, input_dim, hidden_dim, output_dim):
        # Small random init keeps initial outputs near zero.
        self.W1 = np.random.randn(input_dim, hidden_dim) * 0.01
        self.b1 = np.zeros(hidden_dim)
        self.W2 = np.random.randn(hidden_dim, output_dim) * 0.01
        self.b2 = np.zeros(output_dim)

    def forward(self, x):
        """Return fc2(relu(fc1(x))) for a batch of row vectors."""
        hidden = x @ self.W1 + self.b1
        hidden = np.maximum(0, hidden)  # ReLU
        return hidden @ self.W2 + self.b2
class GatingNetwork:
    """Router mapping each input to a softmax distribution over experts."""

    def __init__(self, input_dim, num_experts):
        self.W = np.random.randn(input_dim, num_experts) * 0.01
        self.b = np.zeros(num_experts)

    def forward(self, x):
        """Return per-sample expert probabilities, shape (batch, num_experts)."""
        scores = x @ self.W + self.b
        # Numerically stable softmax: shift by the row-wise max first.
        shifted = scores - np.max(scores, axis=-1, keepdims=True)
        weights = np.exp(shifted)
        return weights / np.sum(weights, axis=-1, keepdims=True)
class MixtureOfExperts:
    """Dense MoE: every expert runs and outputs are mixed by gate weights."""

    def __init__(self, input_dim, hidden_dim, output_dim, num_experts):
        self.num_experts = num_experts
        self.experts = [
            Expert(input_dim, hidden_dim, output_dim) for _ in range(num_experts)
        ]
        self.gating = GatingNetwork(input_dim, num_experts)

    def forward(self, x):
        """Return (output, gates) with output = sum_i gates_i * expert_i(x)."""
        gates = self.gating.forward(x)  # (batch, num_experts)
        # Run every expert, stacking to (batch, num_experts, output_dim).
        stacked = np.stack([expert.forward(x) for expert in self.experts], axis=1)
        # Broadcast the gate weights over the output dimension and mix.
        mixed = np.sum(gates[:, :, np.newaxis] * stacked, axis=1)
        return mixed, gates
# Smoke test of the dense MoE on random data.
moe = MixtureOfExperts(input_dim=64, hidden_dim=128, output_dim=32, num_experts=4)
x = np.random.randn(8, 64)  # batch_size=8
output, gates = moe.forward(x)
print(f"输入形状: {x.shape}")
print(f"输出形状: {output.shape}")
print(f"门控权重形状: {gates.shape}")
print(f"门控权重示例:\n{gates[0]}")
稀疏MoE
Top-K门控
在实际应用中,MoE通常只激活少数专家(稀疏激活):
class SparseMoE:
    """Sparse MoE: each sample is routed only to its top-k experts."""

    def __init__(self, input_dim, hidden_dim, output_dim, num_experts, top_k=2):
        self.num_experts = num_experts
        self.top_k = top_k
        self.experts = [
            Expert(input_dim, hidden_dim, output_dim) for _ in range(num_experts)
        ]
        self.gating = GatingNetwork(input_dim, num_experts)

    def forward(self, x):
        """Return (output, gates, top_k_indices) for a batch of samples."""
        batch_size = x.shape[0]
        gates = self.gating.forward(x)  # (batch, num_experts)

        # Indices of the k largest gate values per sample.
        top_k_indices = np.argsort(gates, axis=-1)[:, -self.top_k:]
        top_k_gates = np.take_along_axis(gates, top_k_indices, axis=-1)
        # Renormalize so the kept weights sum to 1 per sample.
        top_k_gates = top_k_gates / np.sum(top_k_gates, axis=-1, keepdims=True)

        # Evaluate only the chosen experts, sample by sample.
        out_dim = self.experts[0].W2.shape[1]
        output = np.zeros((batch_size, out_dim))
        for row in range(batch_size):
            sample = x[row:row + 1]
            for slot, expert_idx in enumerate(top_k_indices[row]):
                expert_out = self.experts[expert_idx].forward(sample)
                output[row] += top_k_gates[row, slot] * expert_out[0]
        return output, gates, top_k_indices
# Smoke test: sparse MoE with 8 experts, activating 2 per sample.
sparse_moe = SparseMoE(input_dim=64, hidden_dim=128, output_dim=32,
                       num_experts=8, top_k=2)
output, gates, selected = sparse_moe.forward(x)
print(f"专家总数: 8, 激活专家数: 2")
print(f"选中的专家索引:\n{selected}")
可视化专家选择
def visualize_expert_selection(gates, top_k_indices, num_samples=5):
    """Plot the gate-weight heatmap and per-expert selection frequency."""
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Left panel: gate weights of the first few samples as a heatmap.
    heat_ax = axes[0]
    image = heat_ax.imshow(gates[:num_samples], cmap='YlOrRd', aspect='auto')
    heat_ax.set_xlabel('专家编号')
    heat_ax.set_ylabel('样本编号')
    heat_ax.set_title('门控权重分布')
    heat_ax.set_xticks(range(gates.shape[1]))
    plt.colorbar(image, ax=heat_ax)

    # Right panel: how many times each expert was selected over the batch.
    freq_ax = axes[1]
    expert_counts = np.bincount(top_k_indices.flatten(),
                                minlength=gates.shape[1])
    freq_ax.bar(range(len(expert_counts)), expert_counts, color='steelblue')
    freq_ax.set_xlabel('专家编号')
    freq_ax.set_ylabel('被选中次数')
    freq_ax.set_title('专家选择频率')

    plt.tight_layout()
    plt.show()
# Route a larger batch to get a meaningful selection-frequency picture.
x_large = np.random.randn(100, 64)
_, gates_large, selected_large = sparse_moe.forward(x_large)
visualize_expert_selection(gates_large, selected_large)
负载均衡
辅助损失
为避免所有输入都路由到同一专家,需要添加负载均衡损失:
def load_balancing_loss(gates, top_k_indices, num_experts):
    """Measure how evenly tokens are spread across experts.

    Args:
        gates: (batch, num_experts) routing probabilities.
        top_k_indices: (batch, top_k) indices of the selected experts.
        num_experts: total number of experts.

    Returns:
        Tuple (load_balance_loss, router_z_loss, expert_counts):
        - load_balance_loss: variance of the per-expert selection counts
          (0 when every expert is chosen equally often);
        - router_z_loss: mean of the per-sample sum of squared gate values,
          penalizing overly confident routing.
          NOTE(review): the canonical router z-loss squares the log-sum-exp
          of the *logits*; this variant squares the probabilities instead.
        - expert_counts: how many times each expert was selected.
    """
    # Vectorized per-expert selection counts (replaces the Python loop;
    # also drops the previously computed-but-unused `expected_count`).
    expert_counts = np.bincount(
        top_k_indices.flatten(), minlength=num_experts
    ).astype(float)
    # Balance loss: variance of the counts.
    load_balance_loss = np.var(expert_counts)
    # Penalize large gate values.
    router_z_loss = np.mean(np.sum(gates ** 2, axis=-1))
    return load_balance_loss, router_z_loss, expert_counts
# Evaluate balance statistics on the larger routed batch.
lb_loss, z_loss, counts = load_balancing_loss(gates_large, selected_large, 8)
print(f"负载均衡损失: {lb_loss:.4f}")
print(f"Router Z-Loss: {z_loss:.4f}")
print(f"专家选择分布: {counts}")
Switch Transformer风格的损失
def switch_load_balancing_loss(gates, num_experts, capacity_factor=1.25):
    """Switch-Transformer auxiliary load-balancing loss.

    L_aux = N * sum_i(f_i * P_i), where f_i is the fraction of tokens whose
    top-1 gate is expert i and P_i is the mean routing probability of
    expert i; the loss is minimized when both are uniform.

    Args:
        gates: (batch, num_experts) routing probabilities.
        num_experts: total number of experts.
        capacity_factor: kept for API compatibility; not used by this loss
            (capacity enforcement happens at routing time).

    Returns:
        Tuple (aux_loss, f, P) with f and P as length-num_experts arrays.
    """
    # f_i: fraction of tokens assigned (top-1) to each expert, computed in
    # one bincount pass (replaces the per-expert Python loop; the unused
    # `batch_size` local is gone).
    top1 = np.argmax(gates, axis=-1)
    f = np.bincount(top1, minlength=num_experts) / gates.shape[0]
    # P_i: mean routing probability per expert.
    P = np.mean(gates, axis=0)
    # Auxiliary loss.
    aux_loss = num_experts * np.sum(f * P)
    return aux_loss, f, P
# Compare with the Switch-style auxiliary loss on the same batch.
aux_loss, f, P = switch_load_balancing_loss(gates_large, 8)
print(f"Switch辅助损失: {aux_loss:.4f}")
print(f"实际分配比例 f: {f}")
print(f"路由概率 P: {P}")
PyTorch实现
完整的MoE层
try:
    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    class ExpertLayer(nn.Module):
        """Single expert: Linear -> GELU -> Linear feed-forward block."""
        def __init__(self, input_dim, hidden_dim, output_dim):
            super().__init__()
            self.fc1 = nn.Linear(input_dim, hidden_dim)
            self.fc2 = nn.Linear(hidden_dim, output_dim)
            self.activation = nn.GELU()
        def forward(self, x):
            return self.fc2(self.activation(self.fc1(x)))

    class MoELayer(nn.Module):
        """Mixture-of-Experts layer with noisy top-k routing.

        forward() returns both the mixed expert output and a
        load-balancing auxiliary loss to add to the training objective.
        """
        def __init__(self, input_dim, hidden_dim, output_dim,
                     num_experts=8, top_k=2, noise_std=0.1):
            super().__init__()
            self.num_experts = num_experts
            self.top_k = top_k
            # Std of Gaussian noise added to gate logits during training.
            self.noise_std = noise_std
            # Expert networks.
            self.experts = nn.ModuleList([
                ExpertLayer(input_dim, hidden_dim, output_dim)
                for _ in range(num_experts)
            ])
            # Gating network: bias-free linear router.
            self.gate = nn.Linear(input_dim, num_experts, bias=False)
        def forward(self, x):
            # x: (batch, seq_len, dim) -> flatten tokens for routing.
            batch_size, seq_len, dim = x.shape
            x_flat = x.view(-1, dim)  # (batch*seq, dim)
            # Per-token gate logits.
            gate_logits = self.gate(x_flat)
            # Exploration noise only while training.
            if self.training and self.noise_std > 0:
                noise = torch.randn_like(gate_logits) * self.noise_std
                gate_logits = gate_logits + noise
            # Top-K selection; softmax only over the selected logits.
            top_k_logits, top_k_indices = torch.topk(gate_logits, self.top_k, dim=-1)
            top_k_gates = F.softmax(top_k_logits, dim=-1)
            # Accumulate weighted expert outputs.
            output = torch.zeros(x_flat.shape[0], self.experts[0].fc2.out_features,
                                 device=x.device)
            for i, expert in enumerate(self.experts):
                # Tokens that selected this expert in any of their k slots.
                expert_mask = (top_k_indices == i).any(dim=-1)
                if expert_mask.any():
                    expert_input = x_flat[expert_mask]
                    expert_output = expert(expert_input)
                    # Keep only the gate weight of the slot matching expert i.
                    weights = torch.where(top_k_indices[expert_mask] == i,
                                          top_k_gates[expert_mask],
                                          torch.zeros_like(top_k_gates[expert_mask]))
                    weights = weights.sum(dim=-1, keepdim=True)
                    output[expert_mask] += weights * expert_output
            output = output.view(batch_size, seq_len, -1)
            # Load-balancing auxiliary loss (from the same, possibly noisy, logits).
            aux_loss = self._compute_aux_loss(gate_logits)
            return output, aux_loss
        def _compute_aux_loss(self, gate_logits):
            """Load-balancing auxiliary loss (Switch-style N * sum f_i * P_i)."""
            gates = F.softmax(gate_logits, dim=-1)
            # Mean gate probability per expert.
            expert_probs = gates.mean(dim=0)
            # Frequency of each expert being the top-1 choice.
            expert_assignments = torch.argmax(gates, dim=-1)
            expert_freq = torch.zeros(self.num_experts, device=gates.device)
            for i in range(self.num_experts):
                expert_freq[i] = (expert_assignments == i).float().mean()
            # Auxiliary loss.
            aux_loss = self.num_experts * (expert_freq * expert_probs).sum()
            return aux_loss

    # Smoke test.
    moe_layer = MoELayer(input_dim=512, hidden_dim=2048, output_dim=512,
                         num_experts=8, top_k=2)
    x = torch.randn(2, 10, 512)  # (batch, seq_len, dim)
    output, aux_loss = moe_layer(x)
    print(f"输入形状: {x.shape}")
    print(f"输出形状: {output.shape}")
    print(f"辅助损失: {aux_loss.item():.4f}")
except ImportError:
    print("PyTorch未安装")
MoE在Transformer中的应用
MoE-Transformer架构
try:
    class MoETransformerBlock(nn.Module):
        """Transformer block whose FFN sublayer is replaced by an MoE layer."""
        def __init__(self, d_model, n_heads, num_experts=8, top_k=2):
            super().__init__()
            # Self-attention sublayer.
            self.attention = nn.MultiheadAttention(d_model, n_heads, batch_first=True)
            self.norm1 = nn.LayerNorm(d_model)
            # MoE feed-forward sublayer (replaces the standard dense FFN).
            self.moe = MoELayer(
                input_dim=d_model,
                hidden_dim=d_model * 4,
                output_dim=d_model,
                num_experts=num_experts,
                top_k=top_k
            )
            self.norm2 = nn.LayerNorm(d_model)
        def forward(self, x, mask=None):
            # Self-attention with post-norm residual connection.
            attn_out, _ = self.attention(x, x, x, attn_mask=mask)
            x = self.norm1(x + attn_out)
            # MoE FFN with post-norm residual; also propagates the aux loss.
            moe_out, aux_loss = self.moe(x)
            x = self.norm2(x + moe_out)
            return x, aux_loss

    # Smoke test.
    block = MoETransformerBlock(d_model=512, n_heads=8, num_experts=8, top_k=2)
    x = torch.randn(2, 20, 512)
    output, aux_loss = block(x)
    print(f"MoE-Transformer块测试:")
    print(f" 输入: {x.shape}")
    print(f" 输出: {output.shape}")
    print(f" 辅助损失: {aux_loss.item():.4f}")
except NameError:
    print("需要先定义MoELayer")
现代MoE模型
主要模型对比
| 模型 | 专家数 | Top-K | 总参数 | 激活参数 |
|---|---|---|---|---|
| Switch Transformer | 128 | 1 | 1.6T | 25B |
| GLaM | 64 | 2 | 1.2T | 97B |
| Mixtral 8x7B | 8 | 2 | 47B | 13B |
| GPT-4 (推测) | ~16 | 2 | ~1.8T | ~220B |
Mixtral架构特点
# Key configuration of Mixtral 8x7B (values as publicly reported).
mixtral_config = {
    'num_experts': 8,
    'experts_per_token': 2,  # top_k = 2
    'hidden_size': 4096,
    'intermediate_size': 14336,  # FFN width of each expert
    'num_layers': 32,
    'num_attention_heads': 32,
    # Parameter counts.
    'total_params': '46.7B',  # total parameters
    'active_params': '12.9B',  # parameters activated per token
}
print("Mixtral 8x7B 配置:")
for k, v in mixtral_config.items():
    print(f" {k}: {v}")
训练技巧
专家容量限制
def expert_capacity_routing(gates, capacity_factor=1.25, top_k=2):
    """Route tokens to their top-k experts under a per-expert capacity cap.

    capacity = (tokens_per_batch / num_experts) * capacity_factor * top_k.
    Assignments that would overflow a full expert are dropped.

    Args:
        gates: (batch, num_experts) routing probabilities.
        capacity_factor: slack multiplier on the ideal even load.
        top_k: number of experts each token may use.

    Returns:
        Tuple (final_routing, dropped_tokens, expert_load): final_routing is
        the (batch, num_experts) renormalized routing matrix, dropped_tokens
        counts token->expert assignments rejected for capacity, and
        expert_load is how many tokens each expert accepted.
    """
    batch_size, num_experts = gates.shape
    # Per-expert capacity.
    capacity = int((batch_size / num_experts) * capacity_factor * top_k)

    # Top-k expert indices per token. Fix: the original computed argsort
    # twice and bound `top_k_values` to a copy of the *indices*, never the
    # gate values — compute the indices once and drop the dead variable.
    top_k_indices = np.argsort(gates, axis=-1)[:, -top_k:]

    # Current load per expert.
    expert_load = np.zeros(num_experts, dtype=int)
    # Actual routing decisions.
    final_routing = np.zeros((batch_size, num_experts))
    dropped_tokens = 0
    # Greedy first-come-first-served assignment in batch order.
    for i in range(batch_size):
        for expert_idx in top_k_indices[i]:
            if expert_load[expert_idx] < capacity:
                final_routing[i, expert_idx] = gates[i, expert_idx]
                expert_load[expert_idx] += 1
            else:
                dropped_tokens += 1

    # Renormalize each token's kept weights; fully dropped rows stay zero.
    row_sums = final_routing.sum(axis=1, keepdims=True)
    row_sums[row_sums == 0] = 1  # avoid division by zero
    final_routing = final_routing / row_sums
    return final_routing, dropped_tokens, expert_load
# Demonstrate capacity-limited routing on the large batch.
routing, dropped, loads = expert_capacity_routing(gates_large, capacity_factor=1.25)
print(f"丢弃的token数: {dropped}")
print(f"专家负载: {loads}")
常见问题
Q1: MoE相比密集模型有什么优势?
| 方面 | MoE | 密集模型 |
|---|---|---|
| 参数效率 | 高(稀疏激活) | 低 |
| 训练成本 | 较低 | 较高 |
| 推理成本 | 取决于top_k | 固定 |
| 专业化 | 专家可专注不同任务 | 统一处理 |
Q2: 如何解决专家崩溃问题?
- 添加负载均衡损失
- 使用噪声门控
- 专家容量限制
- 辅助路由损失
Q3: top_k如何选择?
- top_k=1:最稀疏,但可能丢失信息
- top_k=2:常用选择,平衡效率和质量
- top_k>2:更好的质量,但效率下降
Q4: MoE训练有什么挑战?
- 负载不均衡
- 通信开销(分布式)
- 训练不稳定
- 专家利用率低
总结
| 概念 | 描述 |
|---|---|
| 专家网络 | 独立的前馈网络 |
| 门控网络 | 决定激活哪些专家 |
| Top-K路由 | 只激活K个专家 |
| 负载均衡 | 确保专家被均匀使用 |
| 容量限制 | 防止单个专家过载 |
参考资料
- Shazeer, N. et al. (2017). “Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer”
- Fedus, W. et al. (2022). “Switch Transformers: Scaling to Trillion Parameter Models”
- Jiang, A. et al. (2024). “Mixtral of Experts”
- Lepikhin, D. et al. (2020). “GShard: Scaling Giant Models with Conditional Computation”
版权声明: 如无特别声明,本文版权归 sshipanoo 所有,转载请注明本文链接。
(采用 CC BY-NC-SA 4.0 许可协议进行授权)
本文标题:《 机器学习基础系列——混合专家模型 》
本文链接:http://localhost:3015/ai/%E6%B7%B7%E5%90%88%E4%B8%93%E5%AE%B6%E6%A8%A1%E5%9E%8B.html
本文最后一次更新于若干天前,文章中的某些内容可能已过时!