LoRA、QLoRA与全量微调完整教程
前言
当预训练模型无法满足特定领域需求时,微调是提升模型性能的关键手段。本文详细介绍 LLM 微调的完整流程,包括数据准备、LoRA/QLoRA 参数高效微调、全量微调等技术。
微调概述
何时需要微调
┌─────────────────────────────────────────────────────────────────┐
│ 是否需要微调决策树 │
├─────────────────────────────────────────────────────────────────┤
│ │
│ 基础模型能完成任务吗? │
│ │ │
│ ├── 是 → Prompt Engineering 足够 │
│ │ │
│ └── 否 → RAG 能解决吗? │
│ │ │
│ ├── 是 → 使用 RAG │
│ │ │
│ └── 否 → 需要微调 │
│ │ │
│ ├── 数据量 < 1000 → Few-shot │
│ ├── 数据量 < 10000 → LoRA │
│ └── 数据量 > 10000 → Full FT │
│ │
└─────────────────────────────────────────────────────────────────┘
微调方法对比
| 方法 | 参数量 | 显存需求 | 训练速度 | 效果 |
|---|---|---|---|---|
| 全量微调 | 100% | 极高 | 慢 | 最好 |
| LoRA | 0.1-1% | 低 | 快 | 很好 |
| QLoRA | 0.1-1% | 极低 | 较快 | 好 |
| Adapter | 1-5% | 中 | 中 | 好 |
| Prefix Tuning | <1% | 低 | 快 | 一般 |
数据准备
数据格式
# Instruction-tuning data format (Alpaca style: instruction / optional input / output).
instruction_data = [
    {
        "instruction": "将以下文本翻译成英文",
        "input": "今天天气很好",
        "output": "The weather is nice today"
    },
    {
        "instruction": "总结以下文章的要点",
        "input": "人工智能(AI)是计算机科学的一个分支...",
        "output": "文章主要介绍了AI的定义、发展历程和应用领域"
    }
]
# Conversational fine-tuning format (chat-message style with roles).
conversation_data = [
    {
        "messages": [
            {"role": "system", "content": "你是一个专业的医疗助手"},
            {"role": "user", "content": "头疼怎么办?"},
            {"role": "assistant", "content": "头疼可能有多种原因..."}
        ]
    }
]
# Preference-pair format used for DPO / RLHF alignment.
preference_data = [
    {
        "prompt": "如何学习编程?",
        "chosen": "学习编程建议从基础开始...",
        "rejected": "随便学学就行了"
    }
]
数据清洗与处理
import json
from typing import List, Dict
import re
class DataProcessor:
    """Cleans, validates, reformats and splits instruction-tuning datasets."""

    def __init__(self, max_length: int = 2048):
        # Token budget; length checks compare character counts against
        # max_length * 4 as a rough chars-per-token estimate.
        self.max_length = max_length

    def clean_text(self, text: str) -> str:
        """Collapse whitespace and drop characters outside the allowed set."""
        # Collapse runs of whitespace into single spaces.
        text = re.sub(r'\s+', ' ', text)
        # Keep word chars, whitespace, CJK ideographs and common punctuation.
        text = re.sub(r'[^\w\s\u4e00-\u9fff.,!?,。!?]', '', text)
        return text.strip()

    def validate_sample(self, sample: dict) -> bool:
        """Return True if required fields are non-empty and the sample fits the budget."""
        if any(not sample.get(field) for field in ("instruction", "output")):
            return False
        # Rough length check: ~4 characters per token.
        total_length = (
            len(sample.get("instruction", ""))
            + len(sample.get("input", ""))
            + len(sample.get("output", ""))
        )
        return total_length <= self.max_length * 4

    def process_dataset(self, data: List[dict]) -> List[dict]:
        """Drop invalid samples and clean the text fields of the rest."""
        return [
            {
                "instruction": self.clean_text(sample["instruction"]),
                "input": self.clean_text(sample.get("input", "")),
                "output": self.clean_text(sample["output"]),
            }
            for sample in data
            if self.validate_sample(sample)
        ]

    def convert_to_conversation(self, data: List[dict]) -> List[dict]:
        """Convert instruction/input/output samples to chat-message format."""
        conversations = []
        for sample in data:
            messages = []
            # Optional system message.
            if sample.get("system"):
                messages.append({"role": "system", "content": sample["system"]})
            # User turn: instruction, plus the input appended when present.
            user_content = sample["instruction"]
            if sample.get("input"):
                user_content += f"\n\n{sample['input']}"
            messages.append({"role": "user", "content": user_content})
            # Assistant turn: the target output.
            messages.append({"role": "assistant", "content": sample["output"]})
            conversations.append({"messages": messages})
        return conversations

    def split_dataset(
        self,
        data: List[dict],
        train_ratio: float = 0.9
    ) -> tuple:
        """Shuffle and split into (train, eval).

        Bug fix: the original shuffled the caller's list in place; we now
        shuffle a copy so the input is left untouched.
        """
        import random
        shuffled = list(data)
        random.shuffle(shuffled)
        split_idx = int(len(shuffled) * train_ratio)
        return shuffled[:split_idx], shuffled[split_idx:]
# Usage: load, clean, split, and persist the dataset.
processor = DataProcessor()
# Load the raw samples (expects a JSON list of instruction dicts).
with open("raw_data.json", "r", encoding="utf-8") as f:
    raw_data = json.load(f)
# Clean/validate, then split 90/10 into train and eval.
processed = processor.process_dataset(raw_data)
train_data, eval_data = processor.split_dataset(processed)
# Persist both splits — the original wrote only train.json and silently
# discarded the eval split.
with open("train.json", "w", encoding="utf-8") as f:
    json.dump(train_data, f, ensure_ascii=False, indent=2)
with open("eval.json", "w", encoding="utf-8") as f:
    json.dump(eval_data, f, ensure_ascii=False, indent=2)
数据增强
class DataAugmenter:
    """LLM-based data augmentation: paraphrasing and back-translation.

    NOTE(review): ``ChatOpenAI`` is never imported in this file — it needs
    e.g. ``from langchain_openai import ChatOpenAI``; confirm before running.
    """
    def __init__(self):
        # LLM used for all augmentation prompts.
        self.llm = ChatOpenAI(model="gpt-4o-mini")
    def paraphrase(self, text: str) -> str:
        """Ask the LLM to restate *text* with the same meaning."""
        prompt = f"请用不同的方式表达相同的意思:{text}"
        response = self.llm.invoke(prompt)
        return response.content
    def generate_variations(
        self,
        sample: dict,
        n: int = 3
    ) -> List[dict]:
        """Return the original sample plus *n* paraphrased-instruction variants."""
        variations = [sample]
        for _ in range(n):
            # Shallow copy: only "instruction" is replaced; other fields are shared.
            new_sample = sample.copy()
            new_sample["instruction"] = self.paraphrase(sample["instruction"])
            variations.append(new_sample)
        return variations
    def back_translate(self, text: str, lang: str = "en") -> str:
        """Augment by round-trip translation: Chinese -> *lang* -> Chinese."""
        # Translate into the pivot language.
        prompt1 = f"Translate to {lang}: {text}"
        translated = self.llm.invoke(prompt1).content
        # Translate back to Chinese.
        prompt2 = f"翻译成中文:{translated}"
        back = self.llm.invoke(prompt2).content
        return back
LoRA 与 PEFT 进阶
1. LoRA 参数深度解析
在配置 LoraConfig 时,三个核心参数决定了微调的效果:
- Rank ($r$):低秩矩阵的维度。通常 $r=8$ 或 $16$ 足够处理大多数任务。对于复杂的逻辑推理或多轮对话,可提升至 $64$ 或 $128$。
- Alpha ($\alpha$):缩放系数。LoRA 的实际权重更新为 $\Delta W \times \frac{\alpha}{r}$。通常设置 $\alpha = 2 \times r$。
- Target Modules:目标模块。不仅要微调 q_proj 和 v_proj,在 Llama 架构中,同时微调 gate_proj、up_proj、down_proj 能显著提升性能。
2. ORPO:无需参考模型的偏好对齐
传统的 DPO 需要一个冻结的参考模型,这会翻倍显存占用。ORPO (Odds Ratio Preference Optimization) 通过在 SFT 损失函数中加入胜率比(Odds Ratio)惩罚,实现了单阶段的偏好对齐。
from trl import ORPOTrainer, ORPOConfig
def train_orpo(model, tokenizer, dataset):
    """Single-stage preference alignment with ORPO (no frozen reference model)."""
    args = ORPOConfig(
        learning_rate=8e-6,
        lr_scheduler_type="cosine",
        max_length=1024,
        max_prompt_length=512,
        beta=0.1,  # strength of the odds-ratio penalty
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        num_train_epochs=1,
        optim="paged_adamw_32bit",
    )
    orpo_trainer = ORPOTrainer(
        model=model,
        args=args,
        train_dataset=dataset,
        tokenizer=tokenizer,
    )
    orpo_trainer.train()
使用 Unsloth 实现极致加速
Unsloth 是目前最快的微调框架,通过手动优化的 Triton 内核,可实现 2x 的速度提升和 70% 的显存节省。
from unsloth import FastLanguageModel
import torch
# 1. Load a pre-quantized 4-bit model.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit",
    max_seq_length = 2048,
    load_in_4bit = True,
)
# 2. Attach LoRA adapters (Unsloth-optimized version).
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 32,
    lora_dropout = 0, # Unsloth recommends 0 for best performance
    bias = "none",
    use_gradient_checkpointing = "unsloth", # aggressive VRAM savings
)
# 3. Export to GGUF (for Ollama / LM Studio):
# model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
QLoRA 微调
4-bit 量化训练
from transformers import BitsAndBytesConfig
import torch
class QLoRATrainer:
    """QLoRA fine-tuning: 4-bit NF4-quantized base model plus LoRA adapters.

    NOTE(review): relies on ``AutoTokenizer``/``AutoModelForCausalLM``/
    ``Trainer``/``TrainingArguments``/``DataCollatorForLanguageModeling``
    (transformers) and ``prepare_model_for_kbit_training``/``LoraConfig``/
    ``TaskType``/``get_peft_model`` (peft) being imported elsewhere — the
    peft imports are not visible in this file; confirm before running.
    """
    def __init__(
        self,
        model_name: str,
        output_dir: str
    ):
        self.model_name = model_name
        self.output_dir = output_dir
        # 4-bit quantization: NF4 data type with double quantization,
        # compute carried out in fp16.
        self.bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True
        )
        # Load tokenizer; fall back to EOS as the padding token when unset.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        # Load the quantized base model, sharded across available devices.
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=self.bnb_config,
            device_map="auto"
        )
        # PEFT helper that prepares a quantized model for k-bit training.
        self.model = prepare_model_for_kbit_training(self.model)
        # LoRA: rank 64 on all attention and MLP projection layers.
        lora_config = LoraConfig(
            r=64,
            lora_alpha=128,
            lora_dropout=0.05,
            target_modules=[
                "q_proj", "k_proj", "v_proj", "o_proj",
                "gate_proj", "up_proj", "down_proj"
            ],
            task_type=TaskType.CAUSAL_LM
        )
        self.model = get_peft_model(self.model, lora_config)
        self.model.print_trainable_parameters()
    def train(self, train_dataset, eval_dataset=None):
        """Run causal-LM training and save the LoRA adapter to output_dir."""
        training_args = TrainingArguments(
            output_dir=self.output_dir,
            num_train_epochs=3,
            per_device_train_batch_size=2,
            gradient_accumulation_steps=8,
            learning_rate=2e-4,
            warmup_ratio=0.03,
            lr_scheduler_type="cosine",
            logging_steps=10,
            save_steps=100,
            # Evaluate periodically only when an eval set is supplied.
            evaluation_strategy="steps" if eval_dataset else "no",
            eval_steps=100 if eval_dataset else None,
            fp16=True,
            optim="paged_adamw_32bit",
            gradient_checkpointing=True,
            max_grad_norm=0.3
        )
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            # mlm=False -> plain causal-LM objective, not masked LM.
            data_collator=DataCollatorForLanguageModeling(
                self.tokenizer,
                mlm=False
            )
        )
        trainer.train()
        # Saves only the LoRA adapter weights, not the full base model.
        self.model.save_pretrained(self.output_dir)
# Usage. NOTE(review): in this document the transformers imports appear
# *after* this instantiation; in a single script, move imports to the top.
qlora_trainer = QLoRATrainer(
    model_name="meta-llama/Llama-2-7b-hf",
    output_dir="./qlora_output"
)
全量微调
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
TrainingArguments,
Trainer
)
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
import torch
class FullFineTuner:
    """Full-parameter fine-tuning with bf16, gradient checkpointing and DeepSpeed.

    Bug fix vs. the original: ``deepspeed=`` and ``fsdp=`` were both passed to
    ``TrainingArguments``; the two sharding backends are mutually exclusive
    and transformers rejects the combination at construction time. DeepSpeed
    ZeRO is kept here (a ds_config.json is provided below). To use FSDP
    instead, drop ``deepspeed`` and pass ``fsdp="full_shard auto_wrap"`` plus
    ``fsdp_transformer_layer_cls_to_wrap="LlamaDecoderLayer"``.
    """

    def __init__(
        self,
        model_name: str,
        output_dir: str
    ):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Full-precision (bf16) load — no quantization for full fine-tuning.
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16,
            use_flash_attention_2=True  # if the hardware/model supports it
        )
        self.output_dir = output_dir

    def train(
        self,
        train_dataset,
        eval_dataset=None,
        num_epochs: int = 3
    ):
        """Train with DeepSpeed ZeRO (config in ds_config.json) and save."""
        training_args = TrainingArguments(
            output_dir=self.output_dir,
            num_train_epochs=num_epochs,
            per_device_train_batch_size=1,
            gradient_accumulation_steps=16,
            learning_rate=2e-5,
            weight_decay=0.01,
            warmup_ratio=0.03,
            lr_scheduler_type="cosine",
            logging_steps=10,
            save_steps=500,
            bf16=True,
            gradient_checkpointing=True,
            deepspeed="ds_config.json"  # DeepSpeed ZeRO config (see below)
        )
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset
        )
        trainer.train()
        trainer.save_model()
DeepSpeed 配置
{
"bf16": {
"enabled": true
},
"zero_optimization": {
"stage": 3,
"offload_optimizer": {
"device": "cpu",
"pin_memory": true
},
"offload_param": {
"device": "cpu",
"pin_memory": true
},
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9,
"reduce_bucket_size": "auto",
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto",
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_gather_16bit_weights_on_model_save": true
},
"gradient_accumulation_steps": 16,
"gradient_clipping": 1.0,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": 1
}
使用 Unsloth 加速
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments
class UnslothTrainer:
    """Unsloth-accelerated QLoRA fine-tuning (4-bit base + LoRA adapters).

    Bug fix vs. the original: the ``max_seq_length`` passed to the
    constructor was ignored by ``train()``, which hard-coded 2048; the value
    is now stored and reused so model loading and the SFT trainer agree.
    """

    def __init__(
        self,
        model_name: str = "unsloth/llama-3-8b-bnb-4bit",
        max_seq_length: int = 2048
    ):
        # Remember the context length so train() uses the same value.
        self.max_seq_length = max_seq_length
        # Load the model (automatic 4-bit quantization).
        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
            model_name=model_name,
            max_seq_length=max_seq_length,
            dtype=None,  # let Unsloth auto-detect the dtype
            load_in_4bit=True
        )
        # Attach LoRA adapters on all attention + MLP projections.
        self.model = FastLanguageModel.get_peft_model(
            self.model,
            r=16,
            lora_alpha=16,
            lora_dropout=0,  # Unsloth recommends 0 for best performance
            target_modules=[
                "q_proj", "k_proj", "v_proj", "o_proj",
                "gate_proj", "up_proj", "down_proj"
            ],
            bias="none",
            use_gradient_checkpointing="unsloth",  # aggressive VRAM savings
            random_state=42
        )

    def train(self, dataset, output_dir: str):
        """Run SFT on a dataset with a "text" field and save the adapter."""
        trainer = SFTTrainer(
            model=self.model,
            tokenizer=self.tokenizer,
            train_dataset=dataset,
            dataset_text_field="text",
            max_seq_length=self.max_seq_length,  # was hard-coded to 2048
            args=TrainingArguments(
                output_dir=output_dir,
                per_device_train_batch_size=2,
                gradient_accumulation_steps=4,
                num_train_epochs=3,
                learning_rate=2e-4,
                # Prefer bf16 on supported GPUs, otherwise fp16.
                fp16=not torch.cuda.is_bf16_supported(),
                bf16=torch.cuda.is_bf16_supported(),
                logging_steps=10,
                optim="adamw_8bit",
                weight_decay=0.01,
                warmup_steps=10,
                lr_scheduler_type="linear",
                seed=42
            )
        )
        trainer.train()
        self.model.save_pretrained(output_dir)
# Usage. NOTE(review): `dataset` is not defined in this snippet — presumably
# a datasets.Dataset with a "text" column (per dataset_text_field above).
unsloth_trainer = UnslothTrainer("unsloth/llama-3-8b-bnb-4bit")
unsloth_trainer.train(dataset, "./unsloth_output")
DPO 偏好对齐
from trl import DPOTrainer, DPOConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
class DPOFineTuner:
    """DPO preference alignment: optimizes a policy against a frozen reference copy."""
    def __init__(self, model_name: str):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Policy model to be optimized.
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16
        )
        # Second copy as the frozen reference model — doubles memory usage.
        self.ref_model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16
        )
    def prepare_dataset(self, data: list):
        """Wrap preference pairs as a datasets.Dataset.

        Expected item format: {"prompt": ..., "chosen": ..., "rejected": ...}
        """
        from datasets import Dataset
        return Dataset.from_list(data)
    def train(self, dataset, output_dir: str):
        """Run DPO training and save the resulting model."""
        dpo_config = DPOConfig(
            output_dir=output_dir,
            num_train_epochs=1,
            per_device_train_batch_size=2,
            gradient_accumulation_steps=4,
            learning_rate=5e-7,
            beta=0.1,  # DPO temperature parameter
            max_length=1024,
            max_prompt_length=512,
            logging_steps=10,
            save_steps=100
        )
        trainer = DPOTrainer(
            model=self.model,
            ref_model=self.ref_model,
            args=dpo_config,
            train_dataset=dataset,
            tokenizer=self.tokenizer
        )
        trainer.train()
        trainer.save_model()
# Usage: align a chat model on preference pairs with DPO.
dpo_trainer = DPOFineTuner("meta-llama/Llama-2-7b-chat-hf")
preference_data = [
    {
        "prompt": "如何学习编程?",
        "chosen": "学习编程建议从Python开始,它语法简洁...",
        "rejected": "随便找个教程看看就行"
    }
]
dataset = dpo_trainer.prepare_dataset(preference_data)
dpo_trainer.train(dataset, "./dpo_output")
评估与部署
模型评估
from transformers import pipeline
import evaluate
class ModelEvaluator:
    """Evaluates a fine-tuned model with ROUGE and prepares human-eval samples."""
    def __init__(self, model_path: str):
        # Text-generation pipeline over the (merged) model.
        self.pipe = pipeline(
            "text-generation",
            model=model_path,
            device_map="auto"
        )
    def evaluate_generation(self, test_data: list) -> dict:
        """Greedy-generate answers for the test samples and score with ROUGE."""
        predictions = []
        references = []
        for sample in test_data:
            prompt = sample["instruction"]
            if sample.get("input"):
                prompt += f"\n{sample['input']}"
            output = self.pipe(
                prompt,
                max_new_tokens=256,
                do_sample=False  # greedy decoding for reproducible scores
            )[0]["generated_text"]
            # The pipeline echoes the prompt; keep only the continuation.
            generated = output[len(prompt):].strip()
            predictions.append(generated)
            references.append(sample["output"])
        # ROUGE metrics via the `evaluate` library.
        rouge = evaluate.load("rouge")
        results = rouge.compute(
            predictions=predictions,
            references=references
        )
        return results
    def human_eval(self, samples: list, num_samples: int = 50) -> dict:
        """Sample up to num_samples items and pair generations with references
        for manual review.

        NOTE(review): unlike evaluate_generation, the prompt prefix is NOT
        stripped from "generated" here — confirm this asymmetry is intended.
        """
        import random
        eval_samples = random.sample(samples, min(num_samples, len(samples)))
        results = []
        for sample in eval_samples:
            output = self.pipe(sample["instruction"], max_new_tokens=256)
            results.append({
                "instruction": sample["instruction"],
                "generated": output[0]["generated_text"],
                "reference": sample["output"]
            })
        return results
# Usage. NOTE(review): `test_data` (a list of instruction samples) is assumed
# to be defined elsewhere.
evaluator = ModelEvaluator("./merged_model")
results = evaluator.evaluate_generation(test_data)
print(f"ROUGE-L: {results['rougeL']}")
最佳实践
| 项目 | 建议 |
|---|---|
| 数据质量 | 质量 > 数量 |
| LoRA rank | 8-64,任务复杂度决定 |
| Learning rate | LoRA: 1e-4 ~ 3e-4 |
| Batch size | 越大越稳定 |
| Epochs | 1-3,避免过拟合 |
| 评估 | 持续监控 loss |
参考资源
版权声明: 如无特别声明,本文版权归 sshipanoo 所有,转载请注明本文链接。
(采用 CC BY-NC-SA 4.0 许可协议进行授权)
本文标题:《 LLM应用开发——微调实战指南 》
本文链接:http://localhost:3015/ai/LLM%E5%BE%AE%E8%B0%83%E5%AE%9E%E6%88%98.html
本文最后一次更新时间较早,文章中的某些内容可能已过时!