LoRA、QLoRA与全量微调完整教程
前言
当预训练模型无法满足特定领域需求时,微调是提升模型性能的关键手段。本文详细介绍 LLM 微调的完整流程,包括数据准备、LoRA/QLoRA 参数高效微调、全量微调等技术。
微调概述
何时需要微调
┌─────────────────────────────────────────────────────────────────┐
│ 是否需要微调决策树 │
├─────────────────────────────────────────────────────────────────┤
│ │
│ 基础模型能完成任务吗? │
│ │ │
│ ├── 是 → Prompt Engineering 足够 │
│ │ │
│ └── 否 → RAG 能解决吗? │
│ │ │
│ ├── 是 → 使用 RAG │
│ │ │
│ └── 否 → 需要微调 │
│ │ │
│ ├── 数据量 < 1000 → Few-shot │
│ ├── 数据量 < 10000 → LoRA │
│ └── 数据量 > 10000 → Full FT │
│ │
└─────────────────────────────────────────────────────────────────┘
微调方法对比
| 方法 | 参数量 | 显存需求 | 训练速度 | 效果 |
|---|---|---|---|---|
| 全量微调 | 100% | 极高 | 慢 | 最好 |
| LoRA | 0.1-1% | 低 | 快 | 很好 |
| QLoRA | 0.1-1% | 极低 | 较快 | 好 |
| Adapter | 1-5% | 中 | 中 | 好 |
| Prefix Tuning | <1% | 低 | 快 | 一般 |
数据准备
数据格式
# Instruction-tuning data format (Alpaca style: instruction / optional input / output).
instruction_data = [
    {
        "instruction": "将以下文本翻译成英文",
        "input": "今天天气很好",
        "output": "The weather is nice today"
    },
    {
        "instruction": "总结以下文章的要点",
        "input": "人工智能(AI)是计算机科学的一个分支...",
        "output": "文章主要介绍了AI的定义、发展历程和应用领域"
    }
]
# Conversational fine-tuning format (chat-message style with roles).
conversation_data = [
    {
        "messages": [
            {"role": "system", "content": "你是一个专业的医疗助手"},
            {"role": "user", "content": "头疼怎么办?"},
            {"role": "assistant", "content": "头疼可能有多种原因..."}
        ]
    }
]
# Preference-pair format used for DPO / RLHF alignment.
preference_data = [
    {
        "prompt": "如何学习编程?",
        "chosen": "学习编程建议从基础开始...",
        "rejected": "随便学学就行了"
    }
]
数据清洗与处理
import json
from typing import List, Dict
import re
class DataProcessor:
    """Cleans, validates, reformats and splits instruction-tuning datasets."""

    def __init__(self, max_length: int = 2048):
        # Token budget; length checks compare character counts against
        # max_length * 4 as a rough chars-per-token estimate.
        self.max_length = max_length

    def clean_text(self, text: str) -> str:
        """Collapse whitespace and drop characters outside the allowed set."""
        # Collapse runs of whitespace into single spaces.
        text = re.sub(r'\s+', ' ', text)
        # Keep word chars, whitespace, CJK ideographs and common punctuation.
        text = re.sub(r'[^\w\s\u4e00-\u9fff.,!?,。!?]', '', text)
        return text.strip()

    def validate_sample(self, sample: dict) -> bool:
        """Return True if required fields are non-empty and the sample fits the budget."""
        if any(not sample.get(field) for field in ("instruction", "output")):
            return False
        # Rough length check: ~4 characters per token.
        total_length = (
            len(sample.get("instruction", ""))
            + len(sample.get("input", ""))
            + len(sample.get("output", ""))
        )
        return total_length <= self.max_length * 4

    def process_dataset(self, data: List[dict]) -> List[dict]:
        """Drop invalid samples and clean the text fields of the rest."""
        return [
            {
                "instruction": self.clean_text(sample["instruction"]),
                "input": self.clean_text(sample.get("input", "")),
                "output": self.clean_text(sample["output"]),
            }
            for sample in data
            if self.validate_sample(sample)
        ]

    def convert_to_conversation(self, data: List[dict]) -> List[dict]:
        """Convert instruction/input/output samples to chat-message format."""
        conversations = []
        for sample in data:
            messages = []
            # Optional system message.
            if sample.get("system"):
                messages.append({"role": "system", "content": sample["system"]})
            # User turn: instruction, plus the input appended when present.
            user_content = sample["instruction"]
            if sample.get("input"):
                user_content += f"\n\n{sample['input']}"
            messages.append({"role": "user", "content": user_content})
            # Assistant turn: the target output.
            messages.append({"role": "assistant", "content": sample["output"]})
            conversations.append({"messages": messages})
        return conversations

    def split_dataset(
        self,
        data: List[dict],
        train_ratio: float = 0.9
    ) -> tuple:
        """Shuffle and split into (train, eval).

        Bug fix: the original shuffled the caller's list in place; we now
        shuffle a copy so the input is left untouched.
        """
        import random
        shuffled = list(data)
        random.shuffle(shuffled)
        split_idx = int(len(shuffled) * train_ratio)
        return shuffled[:split_idx], shuffled[split_idx:]
# Usage: load, clean, split, and persist the dataset.
processor = DataProcessor()
# Load the raw samples (expects a JSON list of instruction dicts).
with open("raw_data.json", "r", encoding="utf-8") as f:
    raw_data = json.load(f)
# Clean/validate, then split 90/10 into train and eval.
processed = processor.process_dataset(raw_data)
train_data, eval_data = processor.split_dataset(processed)
# Persist both splits — the original wrote only train.json and silently
# discarded the eval split.
with open("train.json", "w", encoding="utf-8") as f:
    json.dump(train_data, f, ensure_ascii=False, indent=2)
with open("eval.json", "w", encoding="utf-8") as f:
    json.dump(eval_data, f, ensure_ascii=False, indent=2)
数据增强
class DataAugmenter:
    """LLM-based data augmentation: paraphrasing and back-translation.

    NOTE(review): ``ChatOpenAI`` is never imported in this file — it needs
    e.g. ``from langchain_openai import ChatOpenAI``; confirm before running.
    """
    def __init__(self):
        # LLM used for all augmentation prompts.
        self.llm = ChatOpenAI(model="gpt-4o-mini")
    def paraphrase(self, text: str) -> str:
        """Ask the LLM to restate *text* with the same meaning."""
        prompt = f"请用不同的方式表达相同的意思:{text}"
        response = self.llm.invoke(prompt)
        return response.content
    def generate_variations(
        self,
        sample: dict,
        n: int = 3
    ) -> List[dict]:
        """Return the original sample plus *n* paraphrased-instruction variants."""
        variations = [sample]
        for _ in range(n):
            # Shallow copy: only "instruction" is replaced; other fields are shared.
            new_sample = sample.copy()
            new_sample["instruction"] = self.paraphrase(sample["instruction"])
            variations.append(new_sample)
        return variations
    def back_translate(self, text: str, lang: str = "en") -> str:
        """Augment by round-trip translation: Chinese -> *lang* -> Chinese."""
        # Translate into the pivot language.
        prompt1 = f"Translate to {lang}: {text}"
        translated = self.llm.invoke(prompt1).content
        # Translate back to Chinese.
        prompt2 = f"翻译成中文:{translated}"
        back = self.llm.invoke(prompt2).content
        return back
LoRA 与 PEFT 进阶
1. LoRA 参数深度解析
在配置 LoraConfig 时,三个核心参数决定了微调的效果:
- Rank ($r$):低秩矩阵的维度。通常 $r=8$ 或 $16$ 足够处理大多数任务。对于复杂的逻辑推理或多轮对话,可提升至 $64$ 或 $128$。
- Alpha ($\alpha$):缩放系数。LoRA 的实际权重更新为 $\Delta W \times \frac{\alpha}{r}$。通常设置 $\alpha = 2 \times r$。
- Target Modules:目标模块。不仅要微调 q_proj 和 v_proj,在 Llama 架构中,同时微调 gate_proj、up_proj、down_proj 能显著提升性能。
2. ORPO:无需参考模型的偏好对齐
传统的 DPO 需要一个冻结的参考模型,这会翻倍显存占用。ORPO (Odds Ratio Preference Optimization) 通过在 SFT 损失函数中加入胜率比(Odds Ratio)惩罚,实现了单阶段的偏好对齐。
from trl import ORPOTrainer, ORPOConfig
def train_orpo(model, tokenizer, dataset):
    """Single-stage preference alignment with ORPO (no frozen reference model)."""
    args = ORPOConfig(
        learning_rate=8e-6,
        lr_scheduler_type="cosine",
        max_length=1024,
        max_prompt_length=512,
        beta=0.1,  # strength of the odds-ratio penalty
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        num_train_epochs=1,
        optim="paged_adamw_32bit",
    )
    orpo_trainer = ORPOTrainer(
        model=model,
        args=args,
        train_dataset=dataset,
        tokenizer=tokenizer,
    )
    orpo_trainer.train()
使用 Unsloth 实现极致加速
Unsloth 是目前最快的微调框架,通过手动优化的 Triton 内核,可实现 2x 的速度提升和 70% 的显存节省。
from unsloth import FastLanguageModel
import torch
# 1. Load a pre-quantized 4-bit model.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit",
    max_seq_length = 2048,
    load_in_4bit = True,
)
# 2. Attach LoRA adapters (Unsloth-optimized version).
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 32,
    lora_dropout = 0, # Unsloth recommends 0 for best performance
    bias = "none",
    use_gradient_checkpointing = "unsloth", # aggressive VRAM savings
)
# 3. Export to GGUF (for Ollama / LM Studio):
# model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
QLoRA 微调
4-bit 量化训练
from transformers import BitsAndBytesConfig
import torch
class QLoRATrainer:
    """QLoRA fine-tuning: 4-bit NF4-quantized base model plus LoRA adapters.

    NOTE(review): relies on ``AutoTokenizer``/``AutoModelForCausalLM``/
    ``Trainer``/``TrainingArguments``/``DataCollatorForLanguageModeling``
    (transformers) and ``prepare_model_for_kbit_training``/``LoraConfig``/
    ``TaskType``/``get_peft_model`` (peft) being imported elsewhere — the
    peft imports are not visible in this file; confirm before running.
    """
    def __init__(
        self,
        model_name: str,
        output_dir: str
    ):
        self.model_name = model_name
        self.output_dir = output_dir
        # 4-bit quantization: NF4 data type with double quantization,
        # compute carried out in fp16.
        self.bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True
        )
        # Load tokenizer; fall back to EOS as the padding token when unset.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        # Load the quantized base model, sharded across available devices.
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=self.bnb_config,
            device_map="auto"
        )
        # PEFT helper that prepares a quantized model for k-bit training.
        self.model = prepare_model_for_kbit_training(self.model)
        # LoRA: rank 64 on all attention and MLP projection layers.
        lora_config = LoraConfig(
            r=64,
            lora_alpha=128,
            lora_dropout=0.05,
            target_modules=[
                "q_proj", "k_proj", "v_proj", "o_proj",
                "gate_proj", "up_proj", "down_proj"
            ],
            task_type=TaskType.CAUSAL_LM
        )
        self.model = get_peft_model(self.model, lora_config)
        self.model.print_trainable_parameters()
    def train(self, train_dataset, eval_dataset=None):
        """Run causal-LM training and save the LoRA adapter to output_dir."""
        training_args = TrainingArguments(
            output_dir=self.output_dir,
            num_train_epochs=3,
            per_device_train_batch_size=2,
            gradient_accumulation_steps=8,
            learning_rate=2e-4,
            warmup_ratio=0.03,
            lr_scheduler_type="cosine",
            logging_steps=10,
            save_steps=100,
            # Evaluate periodically only when an eval set is supplied.
            evaluation_strategy="steps" if eval_dataset else "no",
            eval_steps=100 if eval_dataset else None,
            fp16=True,
            optim="paged_adamw_32bit",
            gradient_checkpointing=True,
            max_grad_norm=0.3
        )
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            # mlm=False -> plain causal-LM objective, not masked LM.
            data_collator=DataCollatorForLanguageModeling(
                self.tokenizer,
                mlm=False
            )
        )
        trainer.train()
        # Saves only the LoRA adapter weights, not the full base model.
        self.model.save_pretrained(self.output_dir)
# Usage. NOTE(review): in this document the transformers imports appear
# *after* this instantiation; in a single script, move imports to the top.
qlora_trainer = QLoRATrainer(
    model_name="meta-llama/Llama-2-7b-hf",
    output_dir="./qlora_output"
)
全量微调
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
TrainingArguments,
Trainer
)
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
import torch
class FullFineTuner:
    """Full-parameter fine-tuning with bf16, gradient checkpointing and DeepSpeed.

    Bug fix vs. the original: ``deepspeed=`` and ``fsdp=`` were both passed to
    ``TrainingArguments``; the two sharding backends are mutually exclusive
    and transformers rejects the combination at construction time. DeepSpeed
    ZeRO is kept here (a ds_config.json is provided below). To use FSDP
    instead, drop ``deepspeed`` and pass ``fsdp="full_shard auto_wrap"`` plus
    ``fsdp_transformer_layer_cls_to_wrap="LlamaDecoderLayer"``.
    """

    def __init__(
        self,
        model_name: str,
        output_dir: str
    ):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Full-precision (bf16) load — no quantization for full fine-tuning.
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16,
            use_flash_attention_2=True  # if the hardware/model supports it
        )
        self.output_dir = output_dir

    def train(
        self,
        train_dataset,
        eval_dataset=None,
        num_epochs: int = 3
    ):
        """Train with DeepSpeed ZeRO (config in ds_config.json) and save."""
        training_args = TrainingArguments(
            output_dir=self.output_dir,
            num_train_epochs=num_epochs,
            per_device_train_batch_size=1,
            gradient_accumulation_steps=16,
            learning_rate=2e-5,
            weight_decay=0.01,
            warmup_ratio=0.03,
            lr_scheduler_type="cosine",
            logging_steps=10,
            save_steps=500,
            bf16=True,
            gradient_checkpointing=True,
            deepspeed="ds_config.json"  # DeepSpeed ZeRO config (see below)
        )
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset
        )
        trainer.train()
        trainer.save_model()
DeepSpeed 配置
{
"bf16": {
"enabled": true
},
"zero_optimization": {
"stage": 3,
"offload_optimizer": {
"device": "cpu",
"pin_memory": true
},
"offload_param": {
"device": "cpu",
"pin_memory": true
},
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9,
"reduce_bucket_size": "auto",
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto",
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_gather_16bit_weights_on_model_save": true
},
"gradient_accumulation_steps": 16,
"gradient_clipping": 1.0,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": 1
}
使用 Unsloth 加速
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments
class UnslothTrainer:
    """Unsloth-accelerated QLoRA fine-tuning (4-bit base + LoRA adapters).

    Bug fix vs. the original: the ``max_seq_length`` passed to the
    constructor was ignored by ``train()``, which hard-coded 2048; the value
    is now stored and reused so model loading and the SFT trainer agree.
    """

    def __init__(
        self,
        model_name: str = "unsloth/llama-3-8b-bnb-4bit",
        max_seq_length: int = 2048
    ):
        # Remember the context length so train() uses the same value.
        self.max_seq_length = max_seq_length
        # Load the model (automatic 4-bit quantization).
        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
            model_name=model_name,
            max_seq_length=max_seq_length,
            dtype=None,  # let Unsloth auto-detect the dtype
            load_in_4bit=True
        )
        # Attach LoRA adapters on all attention + MLP projections.
        self.model = FastLanguageModel.get_peft_model(
            self.model,
            r=16,
            lora_alpha=16,
            lora_dropout=0,  # Unsloth recommends 0 for best performance
            target_modules=[
                "q_proj", "k_proj", "v_proj", "o_proj",
                "gate_proj", "up_proj", "down_proj"
            ],
            bias="none",
            use_gradient_checkpointing="unsloth",  # aggressive VRAM savings
            random_state=42
        )

    def train(self, dataset, output_dir: str):
        """Run SFT on a dataset with a "text" field and save the adapter."""
        trainer = SFTTrainer(
            model=self.model,
            tokenizer=self.tokenizer,
            train_dataset=dataset,
            dataset_text_field="text",
            max_seq_length=self.max_seq_length,  # was hard-coded to 2048
            args=TrainingArguments(
                output_dir=output_dir,
                per_device_train_batch_size=2,
                gradient_accumulation_steps=4,
                num_train_epochs=3,
                learning_rate=2e-4,
                # Prefer bf16 on supported GPUs, otherwise fp16.
                fp16=not torch.cuda.is_bf16_supported(),
                bf16=torch.cuda.is_bf16_supported(),
                logging_steps=10,
                optim="adamw_8bit",
                weight_decay=0.01,
                warmup_steps=10,
                lr_scheduler_type="linear",
                seed=42
            )
        )
        trainer.train()
        self.model.save_pretrained(output_dir)
# Usage. NOTE(review): `dataset` is not defined in this snippet — presumably
# a datasets.Dataset with a "text" column (per dataset_text_field above).
unsloth_trainer = UnslothTrainer("unsloth/llama-3-8b-bnb-4bit")
unsloth_trainer.train(dataset, "./unsloth_output")
DPO 偏好对齐
from trl import DPOTrainer, DPOConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
class DPOFineTuner:
    """DPO preference alignment: optimizes a policy against a frozen reference copy."""
    def __init__(self, model_name: str):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Policy model to be optimized.
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16
        )
        # Second copy as the frozen reference model — doubles memory usage.
        self.ref_model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16
        )
    def prepare_dataset(self, data: list):
        """Wrap preference pairs as a datasets.Dataset.

        Expected item format: {"prompt": ..., "chosen": ..., "rejected": ...}
        """
        from datasets import Dataset
        return Dataset.from_list(data)
    def train(self, dataset, output_dir: str):
        """Run DPO training and save the resulting model."""
        dpo_config = DPOConfig(
            output_dir=output_dir,
            num_train_epochs=1,
            per_device_train_batch_size=2,
            gradient_accumulation_steps=4,
            learning_rate=5e-7,
            beta=0.1,  # DPO temperature parameter
            max_length=1024,
            max_prompt_length=512,
            logging_steps=10,
            save_steps=100
        )
        trainer = DPOTrainer(
            model=self.model,
            ref_model=self.ref_model,
            args=dpo_config,
            train_dataset=dataset,
            tokenizer=self.tokenizer
        )
        trainer.train()
        trainer.save_model()
# Usage: align a chat model on preference pairs with DPO.
dpo_trainer = DPOFineTuner("meta-llama/Llama-2-7b-chat-hf")
preference_data = [
    {
        "prompt": "如何学习编程?",
        "chosen": "学习编程建议从Python开始,它语法简洁...",
        "rejected": "随便找个教程看看就行"
    }
]
dataset = dpo_trainer.prepare_dataset(preference_data)
dpo_trainer.train(dataset, "./dpo_output")
评估与部署
模型评估
from transformers import pipeline
import evaluate
class ModelEvaluator:
    """Evaluates a fine-tuned model with ROUGE and prepares human-eval samples."""
    def __init__(self, model_path: str):
        # Text-generation pipeline over the (merged) model.
        self.pipe = pipeline(
            "text-generation",
            model=model_path,
            device_map="auto"
        )
    def evaluate_generation(self, test_data: list) -> dict:
        """Greedy-generate answers for the test samples and score with ROUGE."""
        predictions = []
        references = []
        for sample in test_data:
            prompt = sample["instruction"]
            if sample.get("input"):
                prompt += f"\n{sample['input']}"
            output = self.pipe(
                prompt,
                max_new_tokens=256,
                do_sample=False  # greedy decoding for reproducible scores
            )[0]["generated_text"]
            # The pipeline echoes the prompt; keep only the continuation.
            generated = output[len(prompt):].strip()
            predictions.append(generated)
            references.append(sample["output"])
        # ROUGE metrics via the `evaluate` library.
        rouge = evaluate.load("rouge")
        results = rouge.compute(
            predictions=predictions,
            references=references
        )
        return results
    def human_eval(self, samples: list, num_samples: int = 50) -> dict:
        """Sample up to num_samples items and pair generations with references
        for manual review.

        NOTE(review): unlike evaluate_generation, the prompt prefix is NOT
        stripped from "generated" here — confirm this asymmetry is intended.
        """
        import random
        eval_samples = random.sample(samples, min(num_samples, len(samples)))
        results = []
        for sample in eval_samples:
            output = self.pipe(sample["instruction"], max_new_tokens=256)
            results.append({
                "instruction": sample["instruction"],
                "generated": output[0]["generated_text"],
                "reference": sample["output"]
            })
        return results
# Usage. NOTE(review): `test_data` (a list of instruction samples) is assumed
# to be defined elsewhere.
evaluator = ModelEvaluator("./merged_model")
results = evaluator.evaluate_generation(test_data)
print(f"ROUGE-L: {results['rougeL']}")
最佳实践
| 项目 | 建议 |
|---|---|
| 数据质量 | 质量 > 数量 |
| LoRA rank | 8-64,任务复杂度决定 |
| Learning rate | LoRA: 1e-4 ~ 3e-4 |
| Batch size | 越大越稳定 |
| Epochs | 1-3,避免过拟合 |
| 评估 | 持续监控 loss |
参考资源
版权声明: 如无特别声明,本文版权归 sshipanoo 所有,转载请注明本文链接。
(采用 CC BY-NC-SA 4.0 许可协议进行授权)
本文标题:《 LLM应用开发——微调实战指南 》
本文链接:http://localhost:3015/ai/LLM%E5%BE%AE%E8%B0%83%E5%AE%9E%E6%88%98.html
本文最后一次更新时间较早,文章中的某些内容可能已过时!