HuggingFace Transformers

Introduction

HuggingFace Transformers is the standard library for loading, running, and fine-tuning pretrained LLMs, providing a unified interface to thousands of open-source models.

bash
pip install transformers datasets accelerate sentencepiece
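
A quick way to confirm the environment is ready is to print the library version and check GPU visibility; a minimal sketch (the CUDA check only matters if you plan to run on a GPU):

python
import transformers
import torch

# Library version and whether a CUDA-capable GPU is visible
print(transformers.__version__)
print(torch.cuda.is_available())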

Pipeline: the simplest way to run inference

python
from transformers import pipeline

# Text generation
generator = pipeline(
    "text-generation",
    model="Qwen/Qwen2-1.5B-Instruct",
    device_map="auto",
    torch_dtype="auto"
)

result = generator(
    "分析小微企业贷款的主要风险:",
    max_new_tokens=200,
    temperature=0.7,
    do_sample=True
)
print(result[0]["generated_text"])

# Text classification (sentiment analysis)
classifier = pipeline(
    "text-classification",
    model="uer/roberta-base-finetuned-jd-binary-chinese"
)
result = classifier("这家银行服务态度很好,贷款流程简单")  # "Great service at this bank; the loan process is simple"
print(result)  # e.g. [{'label': 'positive', 'score': 0.98}]
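
pipeline also accepts a list of texts and returns one result per input, which is convenient for scoring many reviews in one call; a small sketch reusing the classifier above (the example sentences are illustrative):

python
# Batch inference: pass a list, get one result dict back per text
texts = [
    "这家银行服务态度很好,贷款流程简单",  # positive review
    "审批太慢,等了两周还没有结果",        # negative review
]
for text, result in zip(texts, classifier(texts)):
    print(text, "->", result["label"], round(result["score"], 3))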

Loading a model and tokenizer

python
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_name = "Qwen/Qwen2-7B-Instruct"

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True
)

# Load the model (weights placed automatically across available GPUs)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)

# Chat inference
messages = [
    {"role": "system", "content": "你是专业的金融分析师"},  # "You are a professional financial analyst"
    {"role": "user", "content": "解释什么是不良贷款率"}  # "Explain the non-performing loan ratio"
]

# Apply the chat template
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

inputs = tokenizer([text], return_tensors="pt").to(model.device)

with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=512,
        temperature=0.7,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

# Decode the output (strip the prompt tokens)
generated = outputs[0][inputs.input_ids.shape[1]:]
response = tokenizer.decode(generated, skip_special_tokens=True)
print(response)
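
To continue the dialogue, append the assistant reply to messages, add the next user turn, and re-apply the chat template; a sketch building on the variables above (the follow-up question is illustrative):

python
# Multi-turn chat: carry the conversation history forward
messages.append({"role": "assistant", "content": response})
messages.append({"role": "user", "content": "不良贷款率多高算危险?"})  # "How high an NPL ratio is dangerous?"

text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer([text], return_tensors="pt").to(model.device)
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=512, temperature=0.7, do_sample=True)
print(tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))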

Streaming generation

python
from transformers import TextStreamer, TextIteratorStreamer
from threading import Thread

# Print streamed output in real time
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

model.generate(
    **inputs,
    max_new_tokens=512,
    streamer=streamer
)

# Asynchronous streaming (for API serving)
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

generation_kwargs = {
    **inputs,
    "max_new_tokens": 512,
    "streamer": streamer
}

thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()

for token in streamer:
    print(token, end="", flush=True)
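
For an API service you would usually wrap this pattern in a generator that yields chunks to the HTTP layer (e.g., as server-sent events); a minimal framework-agnostic sketch, assuming the model and tokenizer loaded above:

python
def stream_generate(prompt, max_new_tokens=512):
    """Yield decoded text chunks as they are generated."""
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
    thread = Thread(
        target=model.generate,
        kwargs={**inputs, "max_new_tokens": max_new_tokens, "streamer": streamer},
    )
    thread.start()
    yield from streamer  # each item is a decoded text chunk
    thread.join()

for chunk in stream_generate("什么是存款准备金率?"):  # "What is the reserve requirement ratio?"
    print(chunk, end="", flush=True)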

Quantized loading (to save GPU memory)

python
from transformers import BitsAndBytesConfig

# INT4 quantization (saves roughly 75% of GPU memory)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

# GPU memory required for the weights of a 7B model
# float32: ~28GB
# float16: ~14GB
# int8:    ~7GB
# int4:    ~4GB
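
The same configuration object also covers INT8; a sketch loading in 8-bit (both 4-bit and 8-bit loading require the bitsandbytes package and a CUDA GPU):

python
# INT8 quantization: roughly half the memory of float16
bnb_config_int8 = BitsAndBytesConfig(load_in_8bit=True)

model_int8 = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config_int8,
    device_map="auto"
)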

Downloading models locally

bash
# Install huggingface_hub (provides the huggingface-cli tool)
pip install huggingface_hub

# Use a mirror endpoint (faster access from mainland China)
export HF_ENDPOINT=https://hf-mirror.com

# Download a model
huggingface-cli download Qwen/Qwen2-7B-Instruct --local-dir ./models/qwen2-7b

python
# Download via the Python API
from huggingface_hub import snapshot_download
import os

os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

snapshot_download(
    repo_id="BAAI/bge-small-zh-v1.5",
    local_dir="./models/bge-small-zh",
    ignore_patterns=["*.msgpack", "*.h5"]
)
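
Once a model is downloaded, pass the local directory to from_pretrained instead of the repo id, so loading needs no network access:

python
# Load directly from the local directory (no Hub access required)
tokenizer = AutoTokenizer.from_pretrained("./models/qwen2-7b")
model = AutoModelForCausalLM.from_pretrained(
    "./models/qwen2-7b",
    torch_dtype=torch.float16,
    device_map="auto"
)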

GPU memory planning

| Model size | float16 | int8 | int4 |
|------------|---------|------|------|
| 1.5B | 3GB | 1.5GB | 1GB |
| 7B | 14GB | 7GB | 4GB |
| 13B | 26GB | 13GB | 7GB |
| 70B | 140GB | 70GB | 35GB |
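
These figures follow from parameter count times bytes per parameter, plus headroom for activations and the KV cache; a rough back-of-the-envelope helper (the 20% overhead factor is an assumption, not a measured value):

python
def estimate_vram_gb(n_params_billion, bits, overhead=1.2):
    """Rough estimate: parameters x bytes per parameter x overhead factor."""
    return n_params_billion * (bits / 8) * overhead

# 7B model in float16: weights alone ~14 GB, ~16.8 GB with 20% headroom
print(estimate_vram_gb(7, 16))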
