# Model Serving Best Practices

## High-Throughput Inference with vLLM
```bash
pip install vllm
```
```python
from vllm import LLM, SamplingParams

# Load the model (vLLM applies PagedAttention automatically to manage GPU memory)
llm = LLM(
    model="Qwen/Qwen2-7B-Instruct",
    tensor_parallel_size=1,      # number of GPUs
    gpu_memory_utilization=0.9,
    max_model_len=8192,
)

# Batch inference (high throughput)
prompts = [
    "Analyze the loan risk of Company A",
    "Analyze the loan risk of Company B",
    "Analyze the loan risk of Company C",
]
sampling_params = SamplingParams(temperature=0.7, max_tokens=512)
outputs = llm.generate(prompts, sampling_params)

for output in outputs:
    print(output.outputs[0].text)
```
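vLLM's continuous batching means throughput scales with how many prompts you submit per `generate()` call, so it is worth measuring tokens-per-second on your own hardware. A minimal sketch reusing the `llm`, `prompts`, and `sampling_params` objects above; it assumes `CompletionOutput` exposes `token_ids` in your vLLM version (otherwise count tokens with the model's tokenizer):

```python
import time

# Time one batched generate() call and report aggregate decode throughput.
start = time.time()
outputs = llm.generate(prompts, sampling_params)
elapsed = time.time() - start

# token_ids holds the generated token IDs of each completion (assumed API).
generated = sum(len(o.outputs[0].token_ids) for o in outputs)
print(f"{generated} tokens in {elapsed:.2f}s ({generated / elapsed:.0f} tok/s)")
```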
## vLLM OpenAI-Compatible Server

```bash
# Start the vLLM server (exposes an OpenAI-compatible API)
python -m vllm.entrypoints.openai.api_server \
    --model Qwen/Qwen2-7B-Instruct \
    --host 0.0.0.0 \
    --port 8080 \
    --tensor-parallel-size 1 \
    --gpu-memory-utilization 0.9
```
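Loading a 7B model takes a while, so wait for the server before sending traffic. A small sketch that polls the server's `/health` endpoint (exposed by the vLLM OpenAI-compatible server; adjust the URL if you changed `--host`/`--port` above):

```python
import time

import requests

# Poll until the vLLM server reports healthy, or give up after ~2 minutes.
for _ in range(60):
    try:
        if requests.get("http://localhost:8080/health", timeout=2).ok:
            break
    except requests.exceptions.ConnectionError:
        pass
    time.sleep(2)
else:
    raise RuntimeError("vLLM server did not become healthy in time")
```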
```python
# Client call (fully compatible with the OpenAI SDK)
from openai import OpenAI

client = OpenAI(api_key="vllm", base_url="http://localhost:8080/v1")
response = client.chat.completions.create(
    model="Qwen/Qwen2-7B-Instruct",
    messages=[{"role": "user", "content": "Hello"}]
)
print(response.choices[0].message.content)
```
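Because the server speaks the OpenAI protocol, streaming also works with the standard `stream=True` flag. A short sketch reusing the `client` above:

```python
# Stream tokens as they are generated instead of waiting for the full reply.
stream = client.chat.completions.create(
    model="Qwen/Qwen2-7B-Instruct",
    messages=[{"role": "user", "content": "Hello"}],
    stream=True,
)
for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)
```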
## Performance Monitoring

```python
import time

import prometheus_client as prom
from fastapi import FastAPI, Response

app = FastAPI()

# Prometheus metrics
REQUEST_COUNT = prom.Counter("llm_requests_total", "Total LLM requests", ["model", "status"])
REQUEST_LATENCY = prom.Histogram("llm_request_duration_seconds", "LLM request latency", ["model"])
TOKEN_COUNT = prom.Counter("llm_tokens_total", "Total tokens used", ["model", "type"])

@app.middleware("http")
async def metrics_middleware(request, call_next):
    start = time.time()
    response = await call_next(request)
    duration = time.time() - start
    if request.url.path == "/chat":
        REQUEST_LATENCY.labels(model="qwen-turbo").observe(duration)
        REQUEST_COUNT.labels(model="qwen-turbo", status=str(response.status_code)).inc()
    return response

@app.get("/metrics")
async def metrics():
    # Serve metrics in the Prometheus text exposition format
    return Response(prom.generate_latest(), media_type=prom.CONTENT_TYPE_LATEST)
```
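The `TOKEN_COUNT` counter above is declared but never incremented. One natural place to update it is wherever your handler sees the model's response, since the standard chat-completions payload carries a `usage` field. A sketch, assuming the `client` from the previous section:

```python
response = client.chat.completions.create(
    model="Qwen/Qwen2-7B-Instruct",
    messages=[{"role": "user", "content": "Hello"}],
)

# usage is part of the standard chat-completions response schema.
TOKEN_COUNT.labels(model="qwen-turbo", type="prompt").inc(response.usage.prompt_tokens)
TOKEN_COUNT.labels(model="qwen-turbo", type="completion").inc(response.usage.completion_tokens)
```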
## Load Balancing Across Multiple Instances

```python
import random

from openai import OpenAI

# Multiple LLM server instances
LLM_ENDPOINTS = [
    "http://llm-api-1:8000/v1",
    "http://llm-api-2:8000/v1",
    "http://llm-api-3:8000/v1",
]

def get_client() -> OpenAI:
    """Random load balancing."""
    endpoint = random.choice(LLM_ENDPOINTS)
    return OpenAI(api_key="sk-xxx", base_url=endpoint)
```
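Random choice is the simplest policy, but it keeps routing traffic to an instance that has gone down. A minimal sketch of round-robin selection with failover on top of the same endpoint list (`chat_with_failover` is an illustrative name, not a library function):

```python
import itertools

_endpoints = itertools.cycle(LLM_ENDPOINTS)

def chat_with_failover(messages, retries=len(LLM_ENDPOINTS)):
    """Try each instance in turn; raise only if all of them fail."""
    last_error = None
    for _ in range(retries):
        endpoint = next(_endpoints)
        try:
            client = OpenAI(api_key="sk-xxx", base_url=endpoint)
            return client.chat.completions.create(
                model="Qwen/Qwen2-7B-Instruct",
                messages=messages,
                timeout=30,
            )
        except Exception as exc:  # a dead instance should not fail the whole request
            last_error = exc
    raise last_error
```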