# HuggingFace Transformers

## Introduction

HuggingFace Transformers is the standard library for loading, running, and fine-tuning pretrained LLMs, providing a unified interface to thousands of open-source models.
```bash
pip install transformers datasets accelerate sentencepiece
```
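A quick sanity check that the install worked, assuming a standard Python environment:

```python
import transformers

# Should print a version string such as "4.x.y" without raising ImportError
print(transformers.__version__)
```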
## Pipeline: the simplest way to run inference

```python
from transformers import pipeline

# Text generation
generator = pipeline(
    "text-generation",
    model="Qwen/Qwen2-1.5B-Instruct",
    device_map="auto",
    torch_dtype="auto"
)
result = generator(
    "分析小微企业贷款的主要风险:",
    max_new_tokens=200,
    temperature=0.7,
    do_sample=True
)
print(result[0]["generated_text"])

# Text classification (sentiment analysis)
classifier = pipeline(
    "text-classification",
    model="uer/roberta-base-finetuned-jd-binary-chinese"
)
result = classifier("这家银行服务态度很好,贷款流程简单")
print(result)  # [{'label': 'positive', 'score': 0.98}]
```
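Pipelines also accept a list of inputs for small-batch scoring. A minimal sketch reusing the `classifier` above (the second review sentence is invented for illustration):

```python
reviews = [
    "这家银行服务态度很好,贷款流程简单",
    "审批太慢,体验很差",
]

# A list input yields one {'label', 'score'} dict per review
for review, pred in zip(reviews, classifier(reviews)):
    print(review, pred["label"], round(pred["score"], 3))
```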
## Loading a model and tokenizer

```python
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_name = "Qwen/Qwen2-7B-Instruct"

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True
)

# Load the model (weights are placed on GPUs automatically)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)

# Chat-style inference
messages = [
    {"role": "system", "content": "你是专业的金融分析师"},
    {"role": "user", "content": "解释什么是不良贷款率"}
]

# Apply the model's chat template
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
inputs = tokenizer([text], return_tensors="pt").to(model.device)

with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=512,
        temperature=0.7,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

# Decode the output (dropping the prompt tokens)
generated = outputs[0][inputs.input_ids.shape[1]:]
response = tokenizer.decode(generated, skip_special_tokens=True)
print(response)
```
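When the same sampling settings are reused across calls, they can be bundled into a `GenerationConfig` instead of repeating keyword arguments each time. A minimal sketch, assuming the `model`, `tokenizer`, and `inputs` defined above:

```python
from transformers import GenerationConfig

# One reusable sampling configuration for every generate() call
gen_config = GenerationConfig(
    max_new_tokens=512,
    temperature=0.7,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,
)
outputs = model.generate(**inputs, generation_config=gen_config)
```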
## Streaming generation

```python
from transformers import TextStreamer, TextIteratorStreamer
from threading import Thread

# Print streamed tokens to stdout in real time
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
model.generate(
    **inputs,
    max_new_tokens=512,
    streamer=streamer
)

# Iterator-based streaming (for serving over an API)
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
generation_kwargs = {
    **inputs,
    "max_new_tokens": 512,
    "streamer": streamer
}
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()
for token in streamer:
    print(token, end="", flush=True)
```
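In an API server, the iterator pattern is typically wrapped in a generator function so the web framework can forward chunks as they arrive. A sketch under the same assumptions (`model` and `tokenizer` already loaded; `stream_chat` is a hypothetical helper name):

```python
def stream_chat(messages, max_new_tokens=512):
    """Yield generated text chunks as the model produces them."""
    text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer([text], return_tensors="pt").to(model.device)
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    # generate() runs in a background thread while the caller consumes chunks
    Thread(
        target=model.generate,
        kwargs={**inputs, "max_new_tokens": max_new_tokens, "streamer": streamer},
    ).start()
    yield from streamer
```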
## Quantized loading (saving GPU memory)

```python
from transformers import BitsAndBytesConfig

# INT4 quantization (saves roughly 75% of GPU memory)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

# Approximate weight memory for a 7B model:
# float32: ~28GB
# float16: ~14GB
# int8:    ~7GB
# int4:    ~4GB
```
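To verify what a loaded model actually occupies, Transformers models expose `get_memory_footprint()`, which returns the size of the loaded weights in bytes:

```python
# Sanity-check the real footprint of the quantized model
print(f"{model.get_memory_footprint() / 1e9:.1f} GB")
```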
## Downloading models locally

```bash
# Using huggingface-cli
pip install huggingface_hub

# Set a mirror endpoint (faster from mainland China)
export HF_ENDPOINT=https://hf-mirror.com

# Download a model
huggingface-cli download Qwen/Qwen2-7B-Instruct --local-dir ./models/qwen2-7b
```

```python
# Download from Python
from huggingface_hub import snapshot_download
import os

os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

snapshot_download(
    repo_id="BAAI/bge-small-zh-v1.5",
    local_dir="./models/bge-small-zh",
    ignore_patterns=["*.msgpack", "*.h5"]
)
```
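Once downloaded, `from_pretrained` accepts a local directory in place of a Hub repo id, so later loads need no network access. A minimal sketch, assuming the Qwen2 download path used above:

```python
# Local path instead of a Hub repo id; works offline
model = AutoModelForCausalLM.from_pretrained(
    "./models/qwen2-7b",
    torch_dtype=torch.float16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained("./models/qwen2-7b")
```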
## GPU memory planning

| Model size | float16 | int8 | int4 |
|---|---|---|---|
| 1.5B | 3GB | 1.5GB | 1GB |
| 7B | 14GB | 7GB | 4GB |
| 13B | 26GB | 13GB | 7GB |
| 70B | 140GB | 70GB | 35GB |
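These figures count weights only; KV cache and activations come on top and grow with context length. The rule of thumb behind the table is simply parameter count times bytes per parameter, as in this sketch (`estimate_weight_gb` is a hypothetical helper):

```python
def estimate_weight_gb(params_billion: float, bits: int) -> float:
    """Weights-only estimate: parameters x (bits / 8) bytes each."""
    return params_billion * bits / 8

print(estimate_weight_gb(7, 16))  # 14.0, matching the 7B / float16 cell
```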