03 - 推理服务部署(全面版)¶
⚠️ 时效性说明:本章涉及前沿模型/价格/榜单等信息,可能随版本快速变化;请以论文原文、官方发布页和 API 文档为准。
📌 定位说明:本章侧重部署架构与推理框架深度对比。
- 📖 应用开发者快速部署指南请参考 LLM 应用/11-大模型部署
- 📖 MLOps 全链路工程化部署请参考 MLOps 与 AI 工程化/02-模型部署与服务化
学习目标:掌握大模型推理服务的部署技术,包括 vLLM、TGI、量化部署和 API 服务封装。
推理部署概述¶
1.1 推理 vs 训练¶
Text Only
训练阶段 vs 推理阶段
训练阶段:
├── 目标:更新模型参数
├── 计算:前向 + 反向传播
├── 内存:存储参数、梯度、优化器状态
├── 批处理:大batch,追求吞吐
└── 精度:FP32/BF16/FP16
推理阶段:
├── 目标:生成预测结果
├── 计算:仅前向传播
├── 内存:只需模型参数 + KV Cache
├── 批处理:动态batch,追求低延迟
└── 精度:可量化到INT8/INT4
推理优化的核心目标:
├── 低延迟(Latency):快速响应
├── 高吞吐(Throughput):处理更多请求
├── 低成本(Cost):减少资源消耗
└── 高可用(Availability):稳定服务
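为了把上面"训练 vs 推理"的内存差异说得更具体,下面给出一个粗略的估算脚本(假设:混合精度训练 + Adam 按约 16 字节/参数计,推理按权重位宽 + KV Cache 计;数字仅作数量级示意):
Python
# 粗略估算同一个 7B 模型在训练与推理阶段的显存需求(不含激活值与框架开销)
def training_memory_gb(params_b: float) -> float:
    # FP16 参数 + FP16 梯度 + FP32 主参数副本 + FP32 Adam 动量/方差 ≈ 16 字节/参数
    return params_b * 1e9 * 16 / 1024**3

def inference_memory_gb(params_b: float, bits_per_param: int = 16,
                        kv_cache_gb: float = 0.0) -> float:
    # 推理只需权重(可量化到 INT8/INT4)+ KV Cache
    return params_b * 1e9 * bits_per_param / 8 / 1024**3 + kv_cache_gb

print(f"训练(FP16 + Adam): {training_memory_gb(7):.0f} GB")       # ≈ 104 GB
print(f"推理 FP16          : {inference_memory_gb(7):.0f} GB")      # ≈ 13 GB
print(f"推理 INT4          : {inference_memory_gb(7, 4):.0f} GB")   # ≈ 3 GB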
1.2 推理架构选择¶
Text Only
┌─────────────────────────────────────────────────────────────────┐
│ 推理部署架构选择 │
├─────────────────────────────────────────────────────────────────┤
│ │
│ 1. 本地部署(Local Deployment) │
│ ├── 适用:开发测试、个人使用 │
│ ├── 工具:Transformers、llama.cpp │
│ └── 特点:简单易用,资源有限 │
│ │
│ 2. 服务器部署(Server Deployment) │
│ ├── 适用:生产环境、API服务 │
│ ├── 工具:vLLM、TGI、TensorRT-LLM │
│ └── 特点:高性能,支持并发 │
│ │
│ 3. 云端部署(Cloud Deployment) │
│ ├── 适用:弹性伸缩、大规模服务 │
│ ├── 平台:AWS SageMaker、Azure ML、GCP Vertex AI │
│ └── 特点:托管服务,自动扩缩容 │
│ │
│ 4. 边缘部署(Edge Deployment) │
│ ├── 适用:移动设备、嵌入式系统 │
│ ├── 工具:MLC-LLM、Qualcomm AI Stack │
│ └── 特点:极致量化,低功耗 │
│ │
└─────────────────────────────────────────────────────────────────┘
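对于第 1 类"本地部署",门槛最低的起点是 Transformers 的 pipeline。下面是一个最小示例(模型名沿用本章使用的 Llama-2-7b,仅作演示,可替换为任意因果语言模型):
Python
# 本地快速体验:Transformers pipeline(适合开发测试,不适合高并发服务)
from transformers import pipeline

pipe = pipeline(
    "text-generation",
    model="meta-llama/Llama-2-7b-hf",
    device_map="auto",    # 自动放置到可用的 GPU/CPU
    torch_dtype="auto",   # 按权重文件自动选择精度
)

print(pipe("The future of AI is", max_new_tokens=50)[0]["generated_text"])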
vLLM 部署实战¶
2.1 vLLM 核心特性¶
Text Only
vLLM核心优势:
1. PagedAttention
├── 将KV Cache分页管理
├── 减少内存碎片
└── 提高内存利用率
2. 连续批处理(Continuous Batching)
├── 动态添加新请求
├── 请求完成后立即释放资源
└── 提高吞吐率
3. 量化支持
├── GPTQ、AWQ、SqueezeLLM
└── 降低内存占用
4. 张量并行
├── 多卡推理
└── 支持大模型
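上面第 2 点的连续批处理在离线的 llm.generate() 中由引擎自动完成;若想直观感受"请求随到随进 batch",可以用 vLLM 的异步引擎接口写一个小实验。以下只是一个草图,AsyncLLMEngine 的具体签名请以所用 vLLM 版本的文档为准:
Python
import asyncio
from vllm import AsyncEngineArgs, AsyncLLMEngine, SamplingParams

engine = AsyncLLMEngine.from_engine_args(
    AsyncEngineArgs(model="meta-llama/Llama-2-7b-hf")
)

async def one_request(request_id: str, prompt: str) -> str:
    """单个请求:随时提交,由引擎在每个 step 动态组 batch"""
    final = None
    async for out in engine.generate(prompt, SamplingParams(max_tokens=64), request_id):
        final = out                      # 流式返回中间结果,这里只保留最终输出
    return final.outputs[0].text

async def main():
    # 三个请求"同时在飞",先完成的先释放其 KV Cache 槽位
    prompts = ["What is vLLM?", "Explain PagedAttention briefly.", "Hi"]
    texts = await asyncio.gather(
        *(one_request(f"req-{i}", p) for i, p in enumerate(prompts))
    )
    for t in texts:
        print(t[:80])

# asyncio.run(main())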
2.2 vLLM 安装与基础使用¶
Bash
# 安装vLLM
pip install vllm
# 对于CUDA 11.8
pip install vllm --extra-index-url https://download.pytorch.org/whl/cu118
# 对于CUDA 12.1
pip install vllm --extra-index-url https://download.pytorch.org/whl/cu121
Python
# 基础推理示例
from vllm import LLM, SamplingParams
# 加载模型
llm = LLM(
model="meta-llama/Llama-2-7b-hf",
tensor_parallel_size=1, # 单卡
gpu_memory_utilization=0.9, # GPU内存使用率
)
# 设置采样参数
sampling_params = SamplingParams(
temperature=0.7,
top_p=0.9,
max_tokens=256,
)
# 生成
prompts = [
"The future of AI is",
"In the beginning,",
"Once upon a time,",
]
outputs = llm.generate(prompts, sampling_params)
# 打印结果
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}")
print(f"Generated: {generated_text!r}")
print("-" * 50)
2.3 vLLM 高级配置¶
Python
from vllm import LLM, SamplingParams
class VLLMDeployment:
"""
vLLM高级部署配置
"""
def __init__(self, model_path, config=None):
self.config = config or {}
# 初始化LLM
self.llm = LLM(
model=model_path,
# 并行配置
tensor_parallel_size=self.config.get('tensor_parallel_size', 1),
pipeline_parallel_size=self.config.get('pipeline_parallel_size', 1),
# 内存配置
gpu_memory_utilization=self.config.get('gpu_memory_utilization', 0.9),
max_num_seqs=self.config.get('max_num_seqs', 256),
max_model_len=self.config.get('max_model_len', 4096),
# 量化配置
quantization=self.config.get('quantization', None),
# 可选: 'awq', 'gptq', 'squeezellm'
# 其他配置
dtype=self.config.get('dtype', 'auto'),
# 可选: 'float16', 'bfloat16', 'float32'
trust_remote_code=self.config.get('trust_remote_code', True),
)
def generate(self, prompts, **sampling_kwargs):
"""
批量生成
"""
sampling_params = SamplingParams(
temperature=sampling_kwargs.get('temperature', 0.7),
top_p=sampling_kwargs.get('top_p', 0.9),
top_k=sampling_kwargs.get('top_k', -1),
max_tokens=sampling_kwargs.get('max_tokens', 256),
presence_penalty=sampling_kwargs.get('presence_penalty', 0.0),
frequency_penalty=sampling_kwargs.get('frequency_penalty', 0.0),
stop=sampling_kwargs.get('stop', None),
)
outputs = self.llm.generate(prompts, sampling_params)
# 格式化输出
results = []
for output in outputs:
results.append({
'prompt': output.prompt,
'text': output.outputs[0].text,
'tokens': len(output.outputs[0].token_ids),
})
return results
def chat(self, messages, **kwargs): # **kwargs收集关键字参数
"""
对话模式
"""
# 构建prompt
prompt = self._build_chat_prompt(messages)
# 生成
result = self.generate([prompt], **kwargs)
return result[0]['text']
def _build_chat_prompt(self, messages):
"""
构建对话prompt
"""
# 使用模型特定的chat template
if hasattr(self.llm.get_tokenizer(), 'apply_chat_template'): # hasattr检查对象是否有某属性
prompt = self.llm.get_tokenizer().apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True
)
else:
# 简单的对话格式
prompt = ""
for msg in messages:
role = msg['role']
content = msg['content']
prompt += f"{role}: {content}\n"
prompt += "assistant:"
return prompt
# 配置示例
VLLM_CONFIG = {
'tensor_parallel_size': 2, # 2卡并行
'gpu_memory_utilization': 0.85,
'max_num_seqs': 128,
'max_model_len': 8192,
'quantization': None,
'dtype': 'bfloat16',
}
# 使用示例
deployment = VLLMDeployment("meta-llama/Llama-2-7b-hf", VLLM_CONFIG)
# 批量生成
results = deployment.generate(
["Hello, how are you?", "What is machine learning?"],
temperature=0.7,
max_tokens=100
)
# 对话模式
response = deployment.chat([
{'role': 'user', 'content': 'Explain quantum computing'}
])
2.4 vLLM 服务部署¶
Python
# 启动vLLM OpenAI兼容API服务
# 命令行方式(vLLM 0.6+推荐使用 vllm serve)
"""
vllm serve meta-llama/Llama-2-7b-hf \
--tensor-parallel-size 2 \
--gpu-memory-utilization 0.85 \
--max-num-seqs 256 \
--port 8000
# 旧版本也可使用:
# python -m vllm.entrypoints.openai.api_server \
# --model meta-llama/Llama-2-7b-hf ...
"""
# 使用Python自定义服务
import time
import uuid

import uvicorn
from fastapi import FastAPI
from vllm import LLM, SamplingParams

# CompletionRequest / ChatCompletionRequest 与 build_chat_prompt 的完整定义见 5.1 节
app = FastAPI()
# 全局模型实例
llm_engine = None
@app.on_event("startup")
async def startup_event(): # async def定义协程函数
global llm_engine
llm_engine = LLM(
model="meta-llama/Llama-2-7b-hf",
tensor_parallel_size=2,
)
@app.post("/v1/completions")
async def create_completion(request: CompletionRequest):
"""
OpenAI兼容的completions API
"""
sampling_params = SamplingParams(
temperature=request.temperature,
max_tokens=request.max_tokens,
top_p=request.top_p,
)
outputs = llm_engine.generate(request.prompt, sampling_params)
return {
"id": "cmpl-" + str(uuid.uuid4()),
"object": "text_completion",
"created": int(time.time()),
"model": request.model,
"choices": [
{
"text": output.outputs[0].text,
"index": i,
"logprobs": None,
"finish_reason": "stop"
}
for i, output in enumerate(outputs) # enumerate同时获取索引和元素
]
}
@app.post("/v1/chat/completions")
async def create_chat_completion(request: ChatCompletionRequest):
"""
OpenAI兼容的chat completions API
"""
# 构建prompt
prompt = build_chat_prompt(request.messages)
sampling_params = SamplingParams(
temperature=request.temperature,
max_tokens=request.max_tokens,
)
outputs = llm_engine.generate(prompt, sampling_params)
return {
"id": "chatcmpl-" + str(uuid.uuid4()),
"object": "chat.completion",
"created": int(time.time()),
"model": request.model,
"choices": [
{
"index": i,
"message": {
"role": "assistant",
"content": output.outputs[0].text
},
"finish_reason": "stop"
}
for i, output in enumerate(outputs)
]
}
if __name__ == "__main__":
uvicorn.run(app, host="0.0.0.0", port=8000)
Text Generation Inference (TGI)¶
3.1 TGI 简介¶
Text Only
TGI (Text Generation Inference)
├── 开发者:Hugging Face
├── 特点:
│ ├── 生产级推理服务器
│ ├── 支持连续批处理
│ ├── 支持流式生成
│ ├── 支持Safetensors格式
│ └── 支持量化(GPTQ、AWQ)
└── 适用:生产环境部署
3.2 TGI 安装与使用¶
Bash
# Docker方式安装(推荐)
docker run --gpus all --shm-size 1g -p 8080:80 \
-v $(pwd)/data:/data \
ghcr.io/huggingface/text-generation-inference:1.4 \
--model-id meta-llama/Llama-2-7b-hf \
--num-shard 2 \
--quantize bitsandbytes
# 安装 Python 客户端(text-generation 是 TGI 的客户端库;TGI 服务端推荐用上面的 Docker 方式运行)
pip install text-generation
Python
# Python客户端
from text_generation import Client
client = Client("http://localhost:8080")
# 文本生成
text = client.generate(
"The future of AI is",
max_new_tokens=100,
temperature=0.7,
top_p=0.9,
).generated_text
print(text)
# 流式生成
for response in client.generate_stream(
"Explain machine learning:",
max_new_tokens=100
):
print(response.token.text, end="", flush=True)
3.3 TGI 高级配置¶
Bash
# 启动参数说明
docker run --gpus all --shm-size 1g -p 8080:80 \
-v $(pwd)/data:/data \
ghcr.io/huggingface/text-generation-inference:latest \
--model-id meta-llama/Llama-2-7b-hf \
--revision main \
--sharded true \
--num-shard 2 \
--quantize bitsandbytes-nf4 \
--max-input-length 4096 \
--max-total-tokens 8192 \
--max-batch-prefill-tokens 16384 \
--max-batch-total-tokens 32768
# 参数说明:
# --model-id: 模型ID或本地路径
# --sharded: 是否使用模型分片
# --num-shard: 分片数量(GPU数量)
# --quantize: 量化方式 (bitsandbytes, bitsandbytes-nf4, bitsandbytes-fp4, gptq, awq, eetq)
# --max-input-length: 最大输入长度
# --max-total-tokens: 最大总token数(输入+输出)
# --max-batch-prefill-tokens: 预填充阶段最大batch token数
# --max-batch-total-tokens: 生成阶段最大batch token数
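除了上面的 text-generation Python 客户端,也可以直接调用 TGI 暴露的 REST 接口。下面用 requests 演示原生的 /generate 端点(端口沿用上面 docker 命令映射的 8080,参数名以 TGI 文档为准):
Python
import requests

TGI_URL = "http://localhost:8080"

resp = requests.post(
    f"{TGI_URL}/generate",
    json={
        "inputs": "Explain continuous batching in one sentence:",
        "parameters": {
            "max_new_tokens": 64,
            "temperature": 0.7,
            "top_p": 0.9,
        },
    },
    timeout=60,
)
resp.raise_for_status()
print(resp.json()["generated_text"])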
3.4 TGI 与 vLLM 对比¶
| 特性 | TGI | vLLM |
|---|---|---|
| PagedAttention | ✅ | ✅ |
| 连续批处理 | ✅ | ✅ |
| 量化支持 | GPTQ, AWQ, BnB | GPTQ, AWQ, SqueezeLLM |
| 张量并行 | ✅ | ✅ |
| 流水线并行 | ❌ | ✅ |
| 流式输出 | ✅ | ✅ |
| OpenAI API | 部分兼容 | ✅ 完整兼容 |
| 生产就绪 | ✅ 更成熟 | ✅ 高性能 |
| 易用性 | 中等 | 高 |
量化部署¶
4.1 量化方法对比¶
Text Only
量化方法对比
═══════════════════════════════════════════════════════════════════
方法 精度 压缩比 性能损失 适用场景
─────────────────────────────────────────────────────────────────
FP16 16bit 2x 无 通用
BF16 16bit 2x 无 Ampere+ GPU
INT8 8bit 4x <1% 通用
INT4 (GPTQ) 4bit 8x 1-3% 本地部署
INT4 (AWQ) 4bit 8x <1% 高质量需求
NF4 (QLoRA) 4bit 8x 1-2% 微调+推理
FP4 4bit 8x 2-4% 实验性
═══════════════════════════════════════════════════════════════════
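上表的压缩比可以直接换算成权重显存占用,用来判断一张卡能否放下某个量化版本。下面是一个粗略的估算函数(只算权重,不含 KV Cache 与激活值):
Python
def weight_memory_gb(num_params_b: float, bits: int) -> float:
    """仅权重部分的显存占用(GB)"""
    return num_params_b * 1e9 * bits / 8 / 1024**3

for name, bits in [("FP16", 16), ("INT8", 8), ("INT4", 4)]:
    print(f"70B 模型 @ {name}: {weight_memory_gb(70, bits):.1f} GB")
# FP16 ≈ 130.4 GB / INT8 ≈ 65.2 GB / INT4 ≈ 32.6 GB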
4.2 GPTQ 量化部署¶
Python
# 使用AutoGPTQ进行量化
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
# 量化配置
quantize_config = BaseQuantizeConfig(
bits=4, # 4-bit量化
group_size=128, # 分组大小
desc_act=False, # act-order:按激活值大小排序量化,可提升精度但会降低推理速度
)
# 加载并量化模型
model = AutoGPTQForCausalLM.from_pretrained(
"meta-llama/Llama-2-7b-hf",
quantize_config,
)
# 准备校准数据(AutoGPTQ 需要 tokenize 后的样本,而非原始字符串)
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
calib_texts = [
    "auto-gptq is an easy-to-use model quantization library",
    "with user-friendly apis",
    # ... 更多校准数据
]
calib_data = [tokenizer(text) for text in calib_texts]
# 执行量化
model.quantize(calib_data)
# 保存量化模型
model.save_quantized("Llama-2-7b-4bit-gptq")
# 加载量化模型进行推理
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
model = AutoGPTQForCausalLM.from_quantized(
"Llama-2-7b-4bit-gptq",
device="cuda:0",
)
# 生成
inputs = tokenizer("Hello, how are you?", return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(outputs[0]))
4.3 AWQ 量化部署¶
Python
# AWQ量化(通常比GPTQ质量更好)
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
# 加载模型
model_path = "meta-llama/Llama-2-7b-hf"
quant_path = "Llama-2-7b-awq"
# 量化配置
quant_config = {
"zero_point": True,
"q_group_size": 128,
"w_bit": 4,
"version": "GEMM"
}
# 加载模型
model = AutoAWQForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# 准备校准数据(AutoAWQ 的 calib_data 接受文本列表或数据集名称,默认为 pileval)
examples = [
    "autoawq is an easy-to-use model quantization library with user-friendly apis"
] * 8
# 量化
model.quantize(
tokenizer,
quant_config=quant_config,
calib_data=examples,
)
# 保存
model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)
# 加载量化模型
model = AutoAWQForCausalLM.from_quantized(
quant_path,
fuse_layers=True, # 融合层以加速
)
4.4 vLLM 中的量化部署¶
Python
# vLLM支持多种量化方式
from vllm import LLM

# 1. AWQ量化
llm = LLM(
model="TheBloke/Llama-2-7B-AWQ",
quantization="awq",
tensor_parallel_size=1,
)
# 2. GPTQ量化
llm = LLM(
model="TheBloke/Llama-2-7B-GPTQ",
quantization="gptq",
tensor_parallel_size=1,
)
# 3. SqueezeLLM量化
llm = LLM(
model="path/to/squeezellm/model",
quantization="squeezellm",
)
API 服务封装¶
5.1 FastAPI 封装¶
Python
import time
import uuid
from typing import List, Optional

import uvicorn
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from vllm import LLM, SamplingParams
app = FastAPI(title="LLM Inference API")
# 全局模型
llm = None
class CompletionRequest(BaseModel): # Pydantic BaseModel:自动数据验证和序列化
model: str
prompt: str
max_tokens: Optional[int] = 256 # Optional表示值可以为None
temperature: Optional[float] = 0.7
top_p: Optional[float] = 0.9
top_k: Optional[int] = -1
stop: Optional[List[str]] = None
stream: Optional[bool] = False
class ChatCompletionRequest(BaseModel):
model: str
messages: List[dict]
max_tokens: Optional[int] = 256
temperature: Optional[float] = 0.7
top_p: Optional[float] = 0.9
stream: Optional[bool] = False
@app.on_event("startup")
async def load_model():
global llm
llm = LLM(
model="meta-llama/Llama-2-7b-hf",
tensor_parallel_size=1,
gpu_memory_utilization=0.9,
)
print("Model loaded successfully")
@app.post("/v1/completions")
async def create_completion(request: CompletionRequest):
try: # try/except捕获异常,防止程序崩溃
sampling_params = SamplingParams(
temperature=request.temperature,
top_p=request.top_p,
top_k=request.top_k,
max_tokens=request.max_tokens,
stop=request.stop,
)
outputs = llm.generate(request.prompt, sampling_params)
return {
"id": f"cmpl-{uuid.uuid4()}",
"object": "text_completion",
"created": int(time.time()),
"model": request.model,
"choices": [
{
"text": output.outputs[0].text,
"index": i,
"logprobs": None,
"finish_reason": "stop"
}
for i, output in enumerate(outputs)
],
"usage": {
"prompt_tokens": len(outputs[0].prompt_token_ids),
"completion_tokens": len(outputs[0].outputs[0].token_ids),
"total_tokens": len(outputs[0].prompt_token_ids) + len(outputs[0].outputs[0].token_ids)
}
}
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@app.post("/v1/chat/completions")
async def create_chat_completion(request: ChatCompletionRequest):
try:
# 构建chat prompt
prompt = build_chat_prompt(request.messages)
sampling_params = SamplingParams(
temperature=request.temperature,
top_p=request.top_p,
max_tokens=request.max_tokens,
)
outputs = llm.generate(prompt, sampling_params)
return {
"id": f"chatcmpl-{uuid.uuid4()}",
"object": "chat.completion",
"created": int(time.time()),
"model": request.model,
"choices": [
{
"index": i,
"message": {
"role": "assistant",
"content": output.outputs[0].text
},
"finish_reason": "stop"
}
for i, output in enumerate(outputs)
],
"usage": {
"prompt_tokens": len(outputs[0].prompt_token_ids),
"completion_tokens": len(outputs[0].outputs[0].token_ids),
"total_tokens": len(outputs[0].prompt_token_ids) + len(outputs[0].outputs[0].token_ids)
}
}
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
def build_chat_prompt(messages):
"""构建chat prompt"""
prompt = ""
for msg in messages:
role = msg.get('role', '')
content = msg.get('content', '')
if role == 'system':
prompt += f"[INST] <<SYS>>\n{content}\n<</SYS>>\n\n"
elif role == 'user':
prompt += f"{content} [/INST]"
elif role == 'assistant':
prompt += f" {content} </s><s>[INST] "
return prompt
@app.get("/health")
async def health_check():
return {"status": "healthy", "model_loaded": llm is not None}
if __name__ == "__main__":
uvicorn.run(app, host="0.0.0.0", port=8000)
5.2 客户端调用示例¶
Python
import requests
import json
class LLMClient:
"""
LLM API客户端
"""
def __init__(self, base_url="http://localhost:8000"):
self.base_url = base_url
def complete(self, prompt, **kwargs):
"""
文本补全
"""
response = requests.post(
f"{self.base_url}/v1/completions",
json={
"model": "llama-2-7b",
"prompt": prompt,
**kwargs
}
)
return response.json()
def chat(self, messages, **kwargs):
"""
对话
"""
response = requests.post(
f"{self.base_url}/v1/chat/completions",
json={
"model": "llama-2-7b",
"messages": messages,
**kwargs
}
)
return response.json()
# 使用示例
client = LLMClient()
# 文本补全
result = client.complete(
"The future of AI is",
max_tokens=100,
temperature=0.7
)
print(result['choices'][0]['text'])
# 对话
result = client.chat([
{"role": "user", "content": "Explain quantum computing"}
])
print(result['choices'][0]['message']['content'])
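上面的请求模型定义了 stream 字段,但服务端与客户端示例都没有实现流式。若后端改用 2.4 节的 vllm serve(OpenAI 兼容)方式启动,客户端可直接用 OpenAI SDK 做流式消费(base_url、端口与模型名均为示例假设):
Python
from openai import OpenAI

# 指向本地 vLLM OpenAI 兼容服务(vllm serve ... --port 8000)
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

stream = client.chat.completions.create(
    model="meta-llama/Llama-2-7b-hf",
    messages=[{"role": "user", "content": "Explain quantum computing briefly"}],
    max_tokens=128,
    stream=True,
)

for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)
print()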
性能优化与监控¶
6.1 性能指标¶
Python
class PerformanceMonitor:
"""
性能监控器
"""
def __init__(self):
self.metrics = {
'latency': [], # 延迟(秒)
'throughput': [], # 吞吐量(tokens/秒)
'queue_size': [], # 队列大小
}
def measure_latency(self, func, *args, **kwargs): # func为传入的函数,*args/**kwargs透传任意参数,实现通用性能测量
"""
测量延迟
"""
import time
start = time.time()
result = func(*args, **kwargs) # 将收集的参数解包后原样传给被测量的函数
end = time.time()
latency = end - start
self.metrics['latency'].append(latency)
return result, latency
def calculate_throughput(self, num_tokens, latency):
"""
计算吞吐量
"""
throughput = num_tokens / latency
self.metrics['throughput'].append(throughput)
return throughput
def get_statistics(self):
"""
获取统计信息
"""
import numpy as np
stats = {}
for key, values in self.metrics.items():
if values:
stats[key] = {
'mean': np.mean(values),
'median': np.median(values),
'p95': np.percentile(values, 95),
'p99': np.percentile(values, 99),
'min': np.min(values),
'max': np.max(values),
}
return stats
# 基准测试(以 vLLM 为例)
from vllm import SamplingParams

def benchmark_inference(model, prompts, max_tokens=100):
    """
    推理基准测试(model 为 vLLM 的 LLM 实例)
    """
    monitor = PerformanceMonitor()
    sampling_params = SamplingParams(max_tokens=max_tokens)
    for prompt in prompts:
        # 测量延迟(vLLM 的 generate 接收 SamplingParams,而非 max_tokens 关键字参数)
        outputs, latency = monitor.measure_latency(
            model.generate,
            [prompt],
            sampling_params,
        )
# 计算吞吐量
num_tokens = len(outputs[0].outputs[0].token_ids)
throughput = monitor.calculate_throughput(num_tokens, latency)
print(f"Prompt: {prompt[:50]}...")
print(f"Latency: {latency:.3f}s")
print(f"Throughput: {throughput:.1f} tokens/s")
print("-" * 50)
# 打印统计
stats = monitor.get_statistics()
print("\n=== Benchmark Results ===")
for metric, values in stats.items():
print(f"\n{metric.upper()}:")
for stat, value in values.items():
print(f" {stat}: {value:.3f}")
6.2 优化建议¶
Text Only
推理优化建议
═══════════════════════════════════════════════════════════════════
1. 批处理优化
├── 使用动态批处理(continuous batching)
├── 设置合适的max_num_seqs
└── 调整max_batch_total_tokens
2. 内存优化
├── 使用量化(INT8/INT4)
├── 启用KV Cache分页(vLLM)
├── 调整gpu_memory_utilization
└── 使用梯度检查点(如果同时训练)
3. 并行优化
├── 张量并行(多卡)
├── 流水线并行(超大模型)
└── 数据并行(多实例)
4. 编译优化
├── 使用 torch.compile(PyTorch 2.0+,示例见下)
├── 使用TensorRT-LLM
└── 使用ONNX Runtime
5. 网络优化
├── 使用gRPC代替HTTP
├── 启用压缩
└── 使用CDN加速模型下载
═══════════════════════════════════════════════════════════════════
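以上面第 4 点的编译优化为例,PyTorch 2.x 的 torch.compile 可以直接套在 HuggingFace 模型的 forward 上。下面是一个示意(收益与稳定性因模型、版本和输入形状是否固定而异,首次调用会触发编译、耗时较长):
Python
# torch.compile 编译优化示意(需 PyTorch 2.x)
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Llama-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.float16
).cuda()
# 只编译 forward,generate 的调度逻辑保持不变
model.forward = torch.compile(model.forward, mode="reduce-overhead")

inputs = tokenizer("The future of AI is", return_tensors="pt").to("cuda")
with torch.inference_mode():
    out = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(out[0], skip_special_tokens=True))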
KV Cache 缓存体系与缓存命中率优化¶
📌 为什么单独成章: KV Cache 和缓存命中率直接决定推理成本与延迟,是 2025-2026 年 LLM 推理工程的核心优化战场。本节从原理到工程实践全面覆盖。
7.1 KV Cache 工作原理精解¶
Text Only
自回归生成的重复计算问题
═══════════════════════════════════════════════════════════════════
【无 KV Cache】
生成第 t 个 token 时:
对前 t-1 个 token 重新计算 Q、K、V → 注意力 → 输出
总计算量:O(L²) 其中 L 为序列长度
【有 KV Cache】
┌───────────────────────────────────┐
Prefill 阶段 │ 输入全部 Prompt → 一次性计算 │
(首次填充) │ 所有 token 的 K、V → 存入显存 │
└─────────────────────┬─────────────┘
│ KV Cache
┌─────────────────────▼─────────────┐
Decode 阶段 │ 每步只计算新 token 的 Q │
(逐 token 生成)│ 用缓存的 K、V 做注意力计算 │
│ 新 K、V append 到 Cache 末尾 │
└───────────────────────────────────┘
计算量对比(以 seq_len=2048 为例):
无缓存:2048 步 × O(2048) = O(2048²) ≈ 400 万次矩阵乘
有缓存:Prefill O(2048²) + Decode 2048 × O(1) ≈ 减少 99% Decode 计算
显存代价(Llama-3-70B, FP16):
每个 token:2(K+V) × 80层 × 8头 × 128维 × 2B = 327 KB
4096 tokens:327KB × 4096 ≈ 1.3 GB
batch=64:1.3GB × 64 ≈ 83 GB ← 显存管理成为核心工程挑战
═══════════════════════════════════════════════════════════════════
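上面 Llama-3-70B 的显存数字可以抽象成一个通用的小函数,换模型结构(层数、GQA 的 KV 头数)或精度时直接复用(粗略估算,按二进制 GB 计,与上文的十进制估算略有出入):
Python
def kv_cache_gb(num_layers: int, num_kv_heads: int, head_dim: int,
                seq_len: int, batch_size: int, bytes_per_elem: int = 2) -> float:
    """KV Cache 显存估算:2(K+V) × 层数 × KV 头数 × head_dim × 每元素字节数 × 序列长度 × batch"""
    per_token_bytes = 2 * num_layers * num_kv_heads * head_dim * bytes_per_elem
    return per_token_bytes * seq_len * batch_size / 1024**3

# Llama-3-70B:80 层、8 个 KV 头(GQA)、head_dim=128、FP16
print(f"单条 4096 tokens : {kv_cache_gb(80, 8, 128, 4096, 1):.2f} GB")   # ≈ 1.25 GB
print(f"batch=64         : {kv_cache_gb(80, 8, 128, 4096, 64):.0f} GB")  # ≈ 80 GB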
7.2 Prefix Caching (前缀缓存)¶
7.2.1 原理: KV Cache 的跨请求复用¶
Text Only
传统 KV Cache:仅在单次请求内复用
请求 A:[System Prompt × 1000 tokens][用户问题 A]
请求 B:[System Prompt × 1000 tokens][用户问题 B] ← 重复计算 System Prompt!
Prefix Caching:跨请求复用相同前缀的 KV Cache
请求 A:[System Prompt KV Cache(命中)][用户问题 A 新算]
请求 B:[System Prompt KV Cache(命中)][用户问题 B 新算]
───────────────────────────
↑ 直接从缓存读取,无需重算
命中条件:
Token 序列完全一致(基于 Token ID 做 Hash 匹配)
常见场景:
├── 多用户共享同一 System Prompt(RAG/Agent 系统)
├── 多轮对话(前几轮历史相同)
├── Few-shot 示例复用
└── 批量处理同类文档(文档前缀相同)
理论收益:
System Prompt = 2000 tokens,用户输入 = 100 tokens
无 Prefix Caching:每次 Prefill = 2100 tokens
有 Prefix Caching(命中):每次 Prefill = 100 tokens
节省:2000/2100 ≈ 95% 的 Prefill 计算!
TTFT 降低比例 ≈ 1 - (100/2100) = 95%
7.2.2 实现机制:基于 Block Hash 的前缀树¶
Text Only
PagedAttention + Prefix Caching 结合(vLLM 的实现)
核心数据结构:Radix Tree(基数树)
每个 Block = 16 个连续 token 的 KV Cache
Block 的唯一标识 = Hash(前缀所有 Block 的 Hash + 本 Block 内的 token_ids)← 内容相同但前缀不同的 Block 不会误命中
示例(Block Size = 4):
请求 1: [T1 T2 T3 T4 | T5 T6 T7 T8 | T9 T10 T11 T12 | ...用户输入...]
├── Block-A (hash:aa11) 命中 ✓
├── Block-B (hash:bb22) 命中 ✓
├── Block-C (hash:cc33) 命中 ✓
└── Block-D (新建) ← 用户独有部分
请求 2: [T1 T2 T3 T4 | T5 T6 T7 T8 | T9 T10 T11 T12 | ...不同用户输入...]
├── Block-A (hash:aa11) 命中 ✓
├── Block-B (hash:bb22) 命中 ✓
├── Block-C (hash:cc33) 命中 ✓
└── Block-E (新建) ← 不同用户输入
缓存淘汰策略:LRU(Least Recently Used)
当显存不足时,优先淘汰最久未使用的 Block
保留热门 System Prompt 的 Block(高命中率保证)
粒度限制:前缀必须对齐 Block 边界
Block 大小 = 16 tokens,前缀长度 = 1537 tokens
实际命中 = floor(1537/16) × 16 = 1536 tokens(最后一个残缺 Block 不命中)
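下面用十几行 Python 把"Block 对齐 + 链式哈希"的匹配逻辑写成一个可运行的小模型,帮助理解为什么 1537 个 token 的前缀只能命中 1536 个(纯示意,真实实现以 vLLM 源码为准):
Python
import hashlib
from typing import List, Set

BLOCK_SIZE = 16

def block_hashes(token_ids: List[int]) -> List[str]:
    """对完整 Block 做链式哈希:每个 Block 的哈希隐含其全部前缀信息"""
    hashes, prev = [], ""
    full = len(token_ids) // BLOCK_SIZE * BLOCK_SIZE   # 只处理完整 Block,残缺 Block 丢弃
    for i in range(0, full, BLOCK_SIZE):
        block = token_ids[i:i + BLOCK_SIZE]
        prev = hashlib.sha256((prev + ",".join(map(str, block))).encode()).hexdigest()
        hashes.append(prev)
    return hashes

def cached_prefix_tokens(request_tokens: List[int], cache: Set[str]) -> int:
    """新请求能命中的前缀 token 数:必须从头连续命中,且对齐 Block 边界"""
    hit = 0
    for h in block_hashes(request_tokens):
        if h not in cache:
            break
        hit += BLOCK_SIZE
    return hit

shared_prefix = list(range(1537))              # 1537 token 的共享 System Prompt
req_a = shared_prefix + [9001, 9002]           # 用户 A 的请求
req_b = shared_prefix + [8001, 8002, 8003]     # 用户 B 的请求

cache = set(block_hashes(req_a))               # 请求 A 先到,其完整 Block 写入缓存
print(cached_prefix_tokens(req_b, cache))      # 输出 1536:最后一个残缺 Block 无法命中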
7.2.3 vLLM 启用 Automatic Prefix Caching¶
Python
from vllm import LLM, SamplingParams
# 启用 Automatic Prefix Caching(APC)
llm = LLM(
model="meta-llama/Llama-3-70B-Instruct",
enable_prefix_caching=True, # 关键参数
gpu_memory_utilization=0.90,
tensor_parallel_size=4,
max_num_seqs=256,
# 缓存预热:启动时预计算固定 System Prompt
# preemption_mode="recompute", # 显存不足时重算(默认)
# preemption_mode="swap", # 显存不足时换出到 CPU
)
# ---- 场景 1:多用户共享 System Prompt ----
SYSTEM_PROMPT = """
You are an expert AI assistant with deep knowledge in...
[此处填充 2000 tokens 的详细系统提示]
"""
def build_messages(user_question: str):
return [
{"role": "system", "content": SYSTEM_PROMPT},
{"role": "user", "content": user_question},
]
# 第一批请求:System Prompt 被计算并缓存
prompts_batch1 = [
llm.get_tokenizer().apply_chat_template(
build_messages(q), tokenize=False, add_generation_prompt=True
)
for q in ["What is RAG?", "Explain Attention", "How does RLHF work?"]
]
sampling_params = SamplingParams(temperature=0.7, max_tokens=512)
# 首次生成:System Prompt 进行 Prefill 并缓存
outputs1 = llm.generate(prompts_batch1, sampling_params)
# 第二批请求:System Prompt KV Cache 命中,TTFT 大幅下降
prompts_batch2 = [
llm.get_tokenizer().apply_chat_template(
build_messages(q), tokenize=False, add_generation_prompt=True
)
for q in ["What is MoE?", "Explain vLLM", "How does LoRA work?"]
]
# 再次生成:System Prompt 命中缓存,只 Prefill 用户输入部分
outputs2 = llm.generate(prompts_batch2, sampling_params)
# ---- 场景 2:多轮对话的前缀复用 ----
class CachedMultiTurnChat:
"""多轮对话:历史轮次命中 Prefix Cache"""
def __init__(self, llm_engine, system_prompt: str):
self.llm = llm_engine
self.system_prompt = system_prompt
self.history = []
def chat(self, user_input: str) -> str:
self.history.append({"role": "user", "content": user_input})
messages = [{"role": "system", "content": self.system_prompt}] + self.history
prompt = self.llm.get_tokenizer().apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
# 每次请求,历史对话前缀命中缓存,只需计算新增 token
output = self.llm.generate(
[prompt],
SamplingParams(temperature=0.7, max_tokens=256)
)
response = output[0].outputs[0].text
self.history.append({"role": "assistant", "content": response})
return response
7.3 缓存命中率:度量、监控与优化目标¶
7.3.1 核心指标定义¶
Text Only
缓存命中率(Cache Hit Rate)
═══════════════════════════════════════════════════════════════════
① Token 级命中率(最常用)
Hit Rate = 命中缓存的 Token 数 / 总 Prefill Token 数
例:System Prompt = 1000 tokens,用户输入 = 100 tokens,全部命中
Hit Rate = 1000 / 1100 = 90.9%
② 请求级命中率
Hit Rate = 发生至少一次命中的请求数 / 总请求数
③ 成本节省率(API 计费场景)
Cost Saving = 命中 Token 节省的费用 / 总原始 Prefill 费用
目标基准(取决于业务场景):
┌────────────────────────────────────┬─────────────────────────────┐
│ 场景 │ 合理目标命中率 │
├────────────────────────────────────┼─────────────────────────────┤
│ 固定 System Prompt + 单次 Q&A │ 80-95% │
│ 多轮对话(10轮以上) │ 70-90%(随轮数增长) │
│ RAG(每次检索结果不同) │ 30-60%(仅 System Prompt) │
│ 代码补全(逐字符变化) │ 5-30%(文件前缀复用) │
│ 批量文档处理(相同模板) │ 90%+ │
└────────────────────────────────────┴─────────────────────────────┘
═══════════════════════════════════════════════════════════════════
7.3.2 vLLM 缓存命中率监控¶
Python
import time
import threading
from collections import defaultdict
from typing import Dict, List, Optional
class PrefixCacheMonitor:
"""
生产级 Prefix Cache 命中率监控
支持实时统计、滑动窗口、告警
"""
def __init__(self, window_size: int = 1000):
self.window_size = window_size
self._lock = threading.Lock()
# 总体统计
self.total_prefill_tokens = 0
self.cache_hit_tokens = 0
self.total_requests = 0
self.hit_requests = 0
# 滑动窗口(最近 N 次请求)
self.window: List[Dict] = []
# 按场景分类统计
self.scene_stats: Dict[str, Dict] = defaultdict(lambda: {
"total_tokens": 0,
"hit_tokens": 0,
"requests": 0,
})
def record(self,
prompt_tokens: int,
cached_tokens: int,
scene: str = "default"):
"""
记录一次请求的缓存情况
Args:
prompt_tokens: 本次 Prefill 的总 token 数
cached_tokens: 其中命中缓存的 token 数(无需重算)
scene: 业务场景标记(用于分场景分析)
"""
with self._lock:
self.total_prefill_tokens += prompt_tokens
self.cache_hit_tokens += cached_tokens
self.total_requests += 1
if cached_tokens > 0:
self.hit_requests += 1
# 场景统计
self.scene_stats[scene]["total_tokens"] += prompt_tokens
self.scene_stats[scene]["hit_tokens"] += cached_tokens
self.scene_stats[scene]["requests"] += 1
# 滑动窗口
entry = {
"timestamp": time.time(),
"prompt_tokens": prompt_tokens,
"cached_tokens": cached_tokens,
"hit_rate": cached_tokens / max(prompt_tokens, 1),
"scene": scene,
}
self.window.append(entry)
if len(self.window) > self.window_size:
self.window.pop(0)
@property
def overall_token_hit_rate(self) -> float:
"""整体 Token 级命中率"""
if self.total_prefill_tokens == 0:
return 0.0
return self.cache_hit_tokens / self.total_prefill_tokens
@property
def overall_request_hit_rate(self) -> float:
"""整体请求级命中率"""
if self.total_requests == 0:
return 0.0
return self.hit_requests / self.total_requests
@property
def recent_token_hit_rate(self) -> float:
"""最近滑动窗口内的 Token 命中率"""
if not self.window:
return 0.0
total = sum(e["prompt_tokens"] for e in self.window)
hit = sum(e["cached_tokens"] for e in self.window)
return hit / max(total, 1)
def compute_cost_saving(self,
price_per_1k_tokens: float = 0.003) -> Dict:
"""
计算节省的 API 费用(适用于 OpenAI / Anthropic 等按 Token 计费场景)
Args:
price_per_1k_tokens: 每千输入 token 的价格(美元)
"""
saved_tokens = self.cache_hit_tokens
original_cost = self.total_prefill_tokens / 1000 * price_per_1k_tokens
saved_cost = saved_tokens / 1000 * price_per_1k_tokens
return {
"total_prefill_tokens": self.total_prefill_tokens,
"saved_tokens": saved_tokens,
"original_cost_usd": round(original_cost, 4),
"saved_cost_usd": round(saved_cost, 4),
"saving_rate": round(saved_cost / max(original_cost, 1e-9), 4),
}
def get_report(self) -> str:
"""输出可读性报告"""
lines = [
"=" * 60,
" Prefix Cache 命中率报告",
"=" * 60,
f" 总请求数 : {self.total_requests:,}",
f" 总 Prefill Token: {self.total_prefill_tokens:,}",
f" 命中 Token 数 : {self.cache_hit_tokens:,}",
f" Token 命中率 : {self.overall_token_hit_rate:.1%}",
f" 请求命中率 : {self.overall_request_hit_rate:.1%}",
f" 近期命中率(滑窗): {self.recent_token_hit_rate:.1%}",
"-" * 60,
" 分场景统计:",
]
for scene, stat in self.scene_stats.items():
scene_rate = stat["hit_tokens"] / max(stat["total_tokens"], 1)
lines.append(
f" [{scene}] 命中率={scene_rate:.1%} "
f"请求={stat['requests']} "
f"hit_tokens={stat['hit_tokens']:,}"
)
lines.append("=" * 60)
return "\n".join(lines)
# ---- 与 vLLM 集成的封装示例 ----
monitor = PrefixCacheMonitor(window_size=2000)
from vllm import LLM, SamplingParams
llm = LLM(
model="meta-llama/Llama-3-8B-Instruct",
enable_prefix_caching=True,
gpu_memory_utilization=0.90,
)
def monitored_generate(prompts: List[str],
scene: str = "default",
**kwargs) -> List:
"""
带监控的推理封装:自动记录缓存命中情况
"""
params = SamplingParams(**kwargs)
outputs = llm.generate(prompts, params)
for output in outputs:
prompt_tokens = len(output.prompt_token_ids)
# 较新版本 vLLM 的 RequestOutput 提供 num_cached_tokens 字段;旧版本没有,故用 getattr 兜底为 0
cached_tokens = getattr(output, "num_cached_tokens", 0)
monitor.record(prompt_tokens, cached_tokens, scene=scene)
return outputs
# 使用示例
results = monitored_generate(
["What is RAG?", "Explain Attention Mechanism"],
scene="qa",
temperature=0.7,
max_tokens=256,
)
print(monitor.get_report())
7.4 主流框架缓存支持对比¶
Text Only
主流框架 Prefix Caching 支持矩阵(2026 Q1)
═══════════════════════════════════════════════════════════════════
框架 | 自动前缀缓存 | 跨请求复用 | 多租户共享 | 缓存预热 | 命中率指标
───────────────┼────────────┼──────────┼──────────┼────────┼──────────
vLLM | ✅ APC | ✅ | ✅ | ✅ | ✅
SGLang | ✅ RadixAttn| ✅ | ✅ | ✅ | ✅(最详细)
TGI | ✅(v2.0+)| ✅ | 部分 | ❌ | 有限
TensorRT-LLM | ✅(KV Reuse)| ✅ | 部分 | ✅ | ✅
llama.cpp | ✅(--cache-prompt)| ✅ | ❌ | ❌ | ❌
Ollama | ✅(自动) | 单会话 | ❌ | ❌ | ❌
推荐选型:
高并发多租户场景:SGLang(缓存控制最细粒度)
通用生产部署:vLLM(APC + 生态最成熟)
轻量本地:Ollama / llama.cpp(零配置,但无多租户共享)
═══════════════════════════════════════════════════════════════════
7.4.1 SGLang RadixAttention 示例¶
Python
# SGLang 以激进的 KV Cache 复用著称
# pip install sglang[all]
import sglang as sgl
@sgl.function
def multi_turn_qa(s, system_prompt, questions):
"""
SGLang 原生多轮对话:自动识别可复用前缀
"""
s += sgl.system(system_prompt)
answers = []
for q in questions:
s += sgl.user(q)
s += sgl.assistant(sgl.gen("answer", max_new_tokens=256))
answers.append(s["answer"])
return answers
# 初始化运行时(启用 Radix Cache)
runtime = sgl.Runtime(
model_path="meta-llama/Llama-3-8B-Instruct",
tp_size=1,
# RadixAttention 默认开启,无需额外参数
)
sgl.set_default_backend(runtime)
# 批处理:相同 System Prompt → 自动共享前缀 KV Cache
SYSTEM = "You are a helpful coding assistant specializing in Python."
queries = [
["What is a decorator?", "How does yield work?"],
["Explain asyncio", "What is GIL?"],
]
for q_list in queries:
    state = multi_turn_qa.run(
        system_prompt=SYSTEM,
        questions=q_list,
    )
    # run() 返回程序状态;循环中同名的 gen("answer") 会被覆盖,这里取到的是最后一轮回答
    print(state["answer"])
runtime.shutdown()
7.5 API 侧 Prompt Caching (节省成本的工程实践)¶
Text Only
主流 API 的 Prompt Caching 支持(2026 Q1)
═══════════════════════════════════════════════════════════════════
供应商 | 功能名称 | 折扣比例 | 触发条件 | 计费粒度
─────────────┼───────────────────┼──────────┼────────────────────┼─────────
Anthropic | Prompt Caching | 输入-90% | 显式 cache_control | 5min TTL
OpenAI | Prompt Caching | 输入-50% | 自动(1024 token+)| 60min TTL
Google Gemini| Context Caching | 输入-75% | 显式 cachedContent | 最短1h TTL
DeepSeek API | Disk Cache | 输入约-90%| 自动 | 按命中量
关键洞察:
- 缓存命中的 token 比未命中便宜 50-90%
- 长 System Prompt(500+ tokens)+ 高频调用 = 显著成本节约
- 缓存有 TTL(生存时间),需要保持调用频率或主动刷新
═══════════════════════════════════════════════════════════════════
Python
import anthropic
import time
from typing import List, Dict
client = anthropic.Anthropic()
# =====================================================================
# Anthropic Prompt Caching 实战
# =====================================================================
LARGE_SYSTEM_PROMPT = """
You are an expert AI assistant with the following detailed knowledge base:
[此处插入 2000+ tokens 的长系统提示、RAG 结果、Few-shot 示例等]
"""
def call_with_cache(user_message: str) -> Dict:
"""
使用 Prompt Caching 的 Anthropic API 调用
"""
response = client.messages.create(
model="claude-opus-4-5",
max_tokens=1024,
system=[
{
"type": "text",
"text": LARGE_SYSTEM_PROMPT,
"cache_control": {"type": "ephemeral"}, # 显式标记缓存点
}
],
messages=[{"role": "user", "content": user_message}],
)
usage = response.usage
return {
"content": response.content[0].text,
"input_tokens": usage.input_tokens,
"cache_creation_tokens": getattr(usage, "cache_creation_input_tokens", 0),
"cache_read_tokens": getattr(usage, "cache_read_input_tokens", 0),
}
def demo_cache_savings():
"""演示缓存带来的成本节约"""
questions = [
"Summarize the key points",
"What are the main challenges?",
"Suggest three improvements",
"Compare with alternative approaches",
]
total_input = 0
total_cache_read = 0
total_cache_create = 0
for i, q in enumerate(questions):
result = call_with_cache(q)
total_input += result["input_tokens"]
total_cache_read += result["cache_read_tokens"]
total_cache_create += result["cache_creation_tokens"]
status = "CACHE HIT" if result["cache_read_tokens"] > 0 else "CACHE MISS"
print(f"请求 {i+1}: [{status}] "
f"input={result['input_tokens']} "
f"cache_read={result['cache_read_tokens']} "
f"cache_create={result['cache_creation_tokens']}")
# 成本计算(claude-opus-4-5 定价)
PRICE_INPUT = 15.0 / 1_000_000 # $15/M tokens
PRICE_CACHE_CREATE = 18.75 / 1_000_000 # $18.75/M tokens(创建缓存溢价)
PRICE_CACHE_READ = 1.5 / 1_000_000 # $1.5/M tokens(缓存读取 -90%)
actual_cost = (
total_input * PRICE_INPUT
+ total_cache_create * PRICE_CACHE_CREATE
+ total_cache_read * PRICE_CACHE_READ
)
no_cache_cost = (total_input + total_cache_read + total_cache_create) * PRICE_INPUT
print(f"\n总计:")
print(f" 实际成本(有缓存): ${actual_cost:.6f}")
print(f" 理论成本(无缓存): ${no_cache_cost:.6f}")
print(f" 节省比例 : {1 - actual_cost/no_cache_cost:.1%}")
demo_cache_savings()
# =====================================================================
# OpenAI Prompt Caching(自动触发,无需显式标记)
# =====================================================================
from openai import OpenAI
oai_client = OpenAI()
def call_openai_with_cache(system_prompt: str, user_message: str) -> Dict:
"""
OpenAI 自动 Prefix Caching:
- Prompt 长度达到 1024 tokens 以上时自动触发,命中的前缀按 128 token 的增量对齐
- 无需任何 API 参数修改
"""
response = oai_client.chat.completions.create(
model="gpt-4o",
messages=[
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_message},
],
)
usage = response.usage
prompt_details = getattr(usage, "prompt_tokens_details", None)
cached_tokens = getattr(prompt_details, "cached_tokens", 0) if prompt_details else 0
token_hit_rate = cached_tokens / max(usage.prompt_tokens, 1)
return {
"content": response.choices[0].message.content,
"prompt_tokens": usage.prompt_tokens,
"cached_tokens": cached_tokens,
"token_hit_rate": token_hit_rate,
}
7.6 语义缓存( Semantic Cache )¶
Text Only
语义缓存 vs Prefix Caching
═══════════════════════════════════════════════════════════════════
Prefix Caching(Token 精确匹配):
命中条件:Token ID 完全一致
适用:固定模板、System Prompt 复用
局限:"What is AI?" 和 "What's AI?" ≠ 命中
语义缓存(Embedding 相似度匹配):
命中条件:语义相似度 > 阈值(通常 0.92-0.97)
适用:FAQ、重复性问答、知识库查询
收益:可缓存完整的 LLM 响应,命中时完全跳过推理
代价:需要额外 Embedding 推理 + 向量检索延迟
何时用语义缓存:
✅ 高频重复性问答(客服、FAQ)
✅ 知识相对固定(不需要实时信息)
✅ 推理成本高(大模型 / 长输出)
❌ 需要实时/个性化响应
❌ 安全敏感场景(缓存攻击风险)
═══════════════════════════════════════════════════════════════════
Python
import hashlib
import json
import time
from typing import Dict, Optional, Tuple
import numpy as np
# 依赖:pip install redis sentence-transformers numpy
class SemanticCache:
"""
生产级语义缓存:
- 精确哈希缓存(L1)+ 语义相似度缓存(L2)
- L1 命中:无需 Embedding,亚毫秒响应
- L2 命中:Embedding 相似度匹配,跳过 LLM 推理
- 未命中:调用 LLM,结果写入双层缓存
"""
def __init__(
self,
redis_url: str = "redis://localhost:6379",
similarity_threshold: float = 0.93,
ttl_seconds: int = 3600,
embedding_model: str = "all-MiniLM-L6-v2",
max_cache_size: int = 10_000,
):
import redis
from sentence_transformers import SentenceTransformer
self.redis = redis.from_url(redis_url, decode_responses=True)
self.threshold = similarity_threshold
self.ttl = ttl_seconds
self.max_size = max_cache_size
self.embedder = SentenceTransformer(embedding_model)
# 内存向量索引(小规模)
# 生产建议换 Faiss / Milvus / Qdrant
self.vectors: list = [] # [(embedding, cache_key)]
# 监控计数器
self.stats = {"l1_hits": 0, "l2_hits": 0, "misses": 0}
def _exact_key(self, query: str) -> str:
"""L1 精确哈希键"""
return "sem_cache:exact:" + hashlib.sha256(query.encode()).hexdigest()
def _semantic_key(self, cache_key: str) -> str:
"""L2 语义缓存的 Redis 键"""
return "sem_cache:semantic:" + cache_key
def _embed(self, text: str) -> np.ndarray:
"""计算归一化 Embedding"""
vec = self.embedder.encode(text, normalize_embeddings=True)
return vec
def _find_similar(self, query_vec: np.ndarray) -> Optional[Tuple[float, str]]:
"""
在内存向量库中找最相似的缓存项
返回 (similarity, cache_key) 或 None
"""
if not self.vectors:
return None
vecs = np.array([v for v, _ in self.vectors]) # (N, dim)
keys = [k for _, k in self.vectors]
# 余弦相似度(已归一化 → 直接点积)
sims = vecs @ query_vec
best_idx = int(np.argmax(sims))
best_sim = float(sims[best_idx])
if best_sim >= self.threshold:
return best_sim, keys[best_idx]
return None
def get(self, query: str) -> Optional[Dict]:
"""
查询缓存
返回 {"response": str, "cache_level": "L1"/"L2", "similarity": float}
或 None(未命中)
"""
# L1:精确哈希命中
exact_key = self._exact_key(query)
cached = self.redis.get(exact_key)
if cached:
self.stats["l1_hits"] += 1
return {"response": cached, "cache_level": "L1", "similarity": 1.0}
# L2:语义相似度命中
query_vec = self._embed(query)
result = self._find_similar(query_vec)
if result:
sim, sem_key = result
response = self.redis.get(self._semantic_key(sem_key))
if response:
self.stats["l2_hits"] += 1
return {"response": response, "cache_level": "L2", "similarity": sim}
self.stats["misses"] += 1
return None
def set(self, query: str, response: str):
"""写入缓存(同时写 L1 和 L2)"""
# L1 精确缓存
exact_key = self._exact_key(query)
self.redis.setex(exact_key, self.ttl, response)
# L2 语义缓存
query_vec = self._embed(query)
cache_key = hashlib.sha256(query.encode()).hexdigest()[:16]
self.redis.setex(self._semantic_key(cache_key), self.ttl, response)
# 更新向量索引(LRU 管理大小)
if len(self.vectors) >= self.max_size:
self.vectors.pop(0) # 简单 FIFO,生产用 LRU
self.vectors.append((query_vec, cache_key))
@property
def hit_rate(self) -> float:
total = sum(self.stats.values())
if total == 0:
return 0.0
return (self.stats["l1_hits"] + self.stats["l2_hits"]) / total
def get_stats(self) -> Dict:
total = sum(self.stats.values())
return {
"l1_hits": self.stats["l1_hits"],
"l2_hits": self.stats["l2_hits"],
"misses": self.stats["misses"],
"total_requests": total,
"hit_rate": f"{self.hit_rate:.1%}",
"l1_hit_rate": f"{self.stats['l1_hits']/max(total,1):.1%}",
"l2_hit_rate": f"{self.stats['l2_hits']/max(total,1):.1%}",
}
# ---- 集成 LLM 调用的完整示例 ----
from openai import OpenAI
sem_cache = SemanticCache(similarity_threshold=0.93, ttl_seconds=7200)
oai = OpenAI()
def cached_llm_call(question: str, system_prompt: str = "") -> Dict:
"""
带语义缓存的 LLM 调用
- 命中:直接返回缓存,延迟 < 10ms
- 未命中:调用 LLM,写入缓存
"""
t0 = time.time()
cache_result = sem_cache.get(question)
if cache_result:
return {
"answer": cache_result["response"],
"source": f"cache({cache_result['cache_level']})",
"similarity": cache_result["similarity"],
"latency_ms": (time.time() - t0) * 1000,
}
# 未命中 → 调用 LLM
response = oai.chat.completions.create(
model="gpt-4o",
messages=[
{"role": "system", "content": system_prompt},
{"role": "user", "content": question},
],
)
answer = response.choices[0].message.content
# 写入缓存
sem_cache.set(question, answer)
return {
"answer": answer,
"source": "llm",
"similarity": None,
"latency_ms": (time.time() - t0) * 1000,
}
# 测试
questions = [
"What is machine learning?",
"What is ML?", # 语义相似 → L2 命中
"Explain machine learning", # 语义相似 → L2 命中
"What is machine learning?", # 精确匹配 → L1 命中
"How does deep learning work?", # 新问题 → 未命中
]
for q in questions:
result = cached_llm_call(q)
print(f"[{result['source']:12}] {q[:40]:40} | {result['latency_ms']:.1f}ms")
print("\n缓存统计:", json.dumps(sem_cache.get_stats(), indent=2, ensure_ascii=False))
7.7 缓存命中率优化最佳实践¶
Text Only
缓存命中率优化策略全景
═══════════════════════════════════════════════════════════════════
① 结构设计:固定前缀前置
✅ 推荐:[System Prompt(固定)][Few-shot(固定)][用户输入(变化)]
❌ 不推荐:[时间戳][用户输入][System Prompt] ← 前缀每次变化
原则:将不变部分放在上下文最前面
效果:命中率 0% → 80%+(视 System Prompt 占比)
② 内容规范化
- 统一空白符、标点、大小写(避免同义字符串不命中)
- 日期/时间等动态内容放在用户输入侧,不放 System Prompt
- 避免在 System Prompt 中插入请求 ID / trace_id 等唯一标识
③ 缓存预热(Cache Warm-up)
服务启动时主动发送一条含完整 System Prompt 的虚拟请求
→ 第一批真实用户请求直接命中缓存,避免冷启动延迟
# vLLM 缓存预热示例
def warm_up_cache(llm, system_prompt: str):
warmup_msg = [
{"role": "system", "content": system_prompt},
{"role": "user", "content": "Hello"},
]
prompt = llm.get_tokenizer().apply_chat_template(
warmup_msg, tokenize=False, add_generation_prompt=True
)
llm.generate([prompt], SamplingParams(max_tokens=1))
print("[Cache] Warm-up complete")
④ 多租户 System Prompt 共享
- 同一业务场景下所有用户共享完全相同的 System Prompt
- 避免在 System Prompt 中插入用户 ID / 个性化内容
- 个性化内容放在用户输入(第一轮消息)中
⑤ Few-shot 示例的缓存友好写法
- Few-shot 示例放在 System Prompt 末尾(固定部分)
- 示例内容不依赖运行时数据(静态知识 OK,动态检索结果不 OK)
⑥ RAG 场景的 Prefix Caching 策略
挑战:每次检索结果不同 → 整个 Prompt 唯一 → 命中率极低
优化方案 1:两段式 Prompt
[固定 System Prompt(缓存)][Query: ...][\n检索结果:\n{docs}]
→ System Prompt 部分仍可命中
优化方案 2:文档 Prefix Caching
针对同类文档批量处理,将文档内容放在前缀
[System Prompt][文档内容(同一文档多个问题)][问题]
→ 同文档不同问题时命中率接近 100%
⑦ 长对话的缓存管理
- 保持对话历史结构不变(追加而非重排)
- 超出上下文窗口时:使用摘要压缩,但保留原始历史作为前缀
- 避免在历史中修改已有消息(会使后续 Block Hash 失效)
⑧ 监控告警阈值
场景 | 告警阈值(Token 命中率)
固定 System Prompt 场景 | < 70% → 排查前缀结构
多轮对话场景 | < 50% → 排查历史清空策略
批量处理场景 | < 85% → 排查模板一致性
═══════════════════════════════════════════════════════════════════
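针对上面第 ①②④ 条,可以在请求入口处统一做一次"缓存友好化"处理:把动态字段从 System Prompt 中剥离、规范化空白符。以下是一个示意函数,字段划分与函数名均为假设:
Python
import re
from typing import Dict, List

def build_cache_friendly_messages(
    static_system_prompt: str,      # 固定不变的系统提示(可被所有用户命中)
    user_input: str,
    dynamic_context: str = "",      # 时间戳、用户画像等动态内容 → 放到用户消息里
) -> List[Dict[str, str]]:
    """构造缓存友好的消息结构:不变部分在前,动态部分放入用户消息"""
    def normalize(text: str) -> str:
        # 统一空白符,避免"同一句话、不同空格"导致 Token 序列不一致
        return re.sub(r"\s+", " ", text).strip()

    user_content = normalize(user_input)
    if dynamic_context:
        user_content = f"[上下文]\n{normalize(dynamic_context)}\n\n[问题]\n{user_content}"

    return [
        {"role": "system", "content": normalize(static_system_prompt)},
        {"role": "user", "content": user_content},
    ]

msgs = build_cache_friendly_messages(
    "You are a helpful assistant.",
    "What   is  RAG?",
    dynamic_context="当前时间:2026-02-12 10:00",
)
print(msgs)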
7.8 端到端性能对比实验¶
Python
import time
import statistics
from typing import List
from vllm import LLM, SamplingParams
def benchmark_prefix_caching(
model_name: str,
system_prompt_tokens: int,
user_input: str,
num_requests: int = 50,
) -> None:
"""
对比启用/禁用 Prefix Caching 下的 TTFT 和吞吐量
"""
# 构造足够长的 System Prompt
system_prompt = "You are a helpful assistant. " * (system_prompt_tokens // 6)
results = {}
for cache_enabled in [False, True]:
llm = LLM(
model=model_name,
enable_prefix_caching=cache_enabled,
gpu_memory_utilization=0.85,
)
tokenizer = llm.get_tokenizer()
messages = [
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_input},
]
prompt = tokenizer.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
params = SamplingParams(max_tokens=128, temperature=0.0)
# 预热(仅在启用缓存时,首次填充缓存)
if cache_enabled:
llm.generate([prompt], params)
latencies = []
t_total_start = time.time()
for _ in range(num_requests):
t0 = time.time()
llm.generate([prompt], params)
latencies.append(time.time() - t0)
total_time = time.time() - t_total_start
label = "Prefix Cache ON " if cache_enabled else "Prefix Cache OFF"
results[label] = {
"mean_latency_ms": statistics.mean(latencies) * 1000,
"p50_latency_ms": statistics.median(latencies) * 1000,
"p95_latency_ms": sorted(latencies)[int(num_requests * 0.95)] * 1000,
"throughput_rps": num_requests / total_time,
}
del llm  # 释放引擎引用(注意:同一进程内显存未必能完全回收,严格对比时建议分进程分别运行)
# 输出对比
print(f"\n{'='*65}")
print(f" Benchmark: System Prompt ≈ {system_prompt_tokens} tokens")
print(f" 模型: {model_name} | 请求数: {num_requests}")
print(f"{'='*65}")
print(f"{'指标':<20} {'无缓存':>18} {'有缓存':>18} {'提升':>12}")
print(f"{'-'*65}")
no_cache = results["Prefix Cache OFF"]
has_cache = results["Prefix Cache ON "]
for key, label in [
("mean_latency_ms", "平均延迟 (ms)"),
("p50_latency_ms", "P50 延迟 (ms)"),
("p95_latency_ms", "P95 延迟 (ms)"),
("throughput_rps", "吞吐量 (req/s)"),
]:
v_no = no_cache[key]
v_yes = has_cache[key]
if key == "throughput_rps":
improvement = f"+{(v_yes/v_no - 1)*100:.1f}%"
else:
improvement = f"-{(1 - v_yes/v_no)*100:.1f}%"
print(f" {label:<18} {v_no:>15.1f} {v_yes:>15.1f} {improvement:>10}")
print(f"{'='*65}\n")
# 运行基准测试(需要 GPU 环境)
# benchmark_prefix_caching(
# model_name="meta-llama/Llama-3-8B-Instruct",
# system_prompt_tokens=1000,
# user_input="What is the capital of France?",
# num_requests=50,
# )
总结¶
部署方案选择指南¶
| 场景 | 推荐方案 | 理由 |
|---|---|---|
| 本地开发 | Transformers / llama.cpp | 简单易用 |
| 生产 API | vLLM / TGI | 高性能,稳定 |
| 超大模型 | vLLM + 张量并行 | 支持多卡 |
| 边缘设备 | llama.cpp / MLC-LLM | 极致量化 |
| 快速原型 | HuggingFace Inference API | 无需部署 |
| 高频重复 Q&A | 语义缓存 + vLLM APC | 成本最低 |
部署检查清单¶
- 模型格式正确( Safetensors 推荐)
- 量化配置适当
- 批处理参数调优
- 内存使用监控
- 错误处理机制
- 日志记录完善
- 健康检查接口
- 自动扩缩容策略
缓存体系检查清单¶
- enable_prefix_caching=True(vLLM)或对应框架缓存参数已开启
- System Prompt 固定不变、位于 Prompt 最前端
- 服务启动后执行缓存预热请求
- 缓存命中率监控接入 Prometheus / Grafana
- Token 命中率告警阈值已配置(建议 > 70%)
- API 侧 Prompt Caching(Anthropic cache_control / OpenAI 自动)已验证生效
- 高频 FAQ 场景已接入语义缓存(Semantic Cache)
- 多轮对话历史仅追加,不修改历史消息
最后更新日期: 2026-02-12 适用版本: LLM 学习教程 v2026
下一步:学习04-对齐技术,掌握 RLHF 和 DPO 等对齐技术!