
Embedding: Principles and Practice

What Is an Embedding

An embedding maps text to a point in a high-dimensional vector space; semantically similar texts end up closer together in that space.

python
from sentence_transformers import SentenceTransformer
import numpy as np

model = SentenceTransformer("BAAI/bge-small-zh-v1.5")

sentences = [
    "招商银行不良贷款率为0.95%",
    "CMB non-performing loan ratio is 0.95%",
    "今天天气很好",
]

embeddings = model.encode(sentences, normalize_embeddings=True)

# Compute cosine similarity
def cosine_sim(a, b):
    return np.dot(a, b)  # vectors are normalized, so the dot product is the cosine similarity

print(f"中英文相似度: {cosine_sim(embeddings[0], embeddings[1]):.3f}")  # ~0.85
print(f"无关句子相似度: {cosine_sim(embeddings[0], embeddings[2]):.3f}")  # ~0.2

The BGE Family (Recommended)

python
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

# BGE models expect an instruction prefix on the query side for retrieval
embeddings = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-large-zh-v1.5",
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True},
    query_instruction="为这个句子生成表示以用于检索相关文章:"
)

# embed_query prepends the instruction automatically; documents are embedded as-is
query_embedding = embeddings.embed_query("不良贷款率是什么")
doc_embedding = embeddings.embed_documents(["不良贷款率的定义..."])
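
With query and document vectors in hand, retrieval reduces to a nearest-neighbor lookup. A minimal sketch under the setup above (the document texts are made-up examples):

python
import numpy as np

docs = ["不良贷款率的定义...", "今天天气很好"]
doc_vecs = np.array(embeddings.embed_documents(docs))
query_vec = np.array(embeddings.embed_query("不良贷款率是什么"))

# Vectors are normalized, so the dot product ranks documents by cosine similarity
scores = doc_vecs @ query_vec
for doc, score in sorted(zip(docs, scores), key=lambda pair: -pair[1]):
    print(f"{score:.3f}  {doc}")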

API Embeddings (No Local Model Required)

python
from openai import OpenAI

client = OpenAI(
    api_key="sk-xxx",
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
)

def get_embedding(text: str) -> list[float]:
    response = client.embeddings.create(
        model="text-embedding-v3",  # 千问 Embedding
        input=text,
        dimensions=512  # 可选维度
    )
    return response.data[0].embedding

embedding = get_embedding("招商银行2024年年报")
print(f"向量维度: {len(embedding)}")

Batch Embedding Optimization

python
from typing import List

import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel

class EfficientEmbedder:
    def __init__(self, model_name: str = "BAAI/bge-small-zh-v1.5"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.model.eval()
    
    @torch.no_grad()
    def encode(self, texts: List[str], batch_size: int = 32) -> np.ndarray:
        all_embeddings = []
        
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i+batch_size]
            encoded = self.tokenizer(
                batch,
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors="pt"
            )
            
            outputs = self.model(**encoded)
            # Use the [CLS] token as the sentence representation (BGE uses CLS pooling)
            embeddings = outputs.last_hidden_state[:, 0, :]
            embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
            all_embeddings.append(embeddings.numpy())
        
        return np.vstack(all_embeddings)
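
Example usage of the class above (the model weights are downloaded from Hugging Face on first run; bge-small-zh-v1.5 produces 512-dimensional vectors):

python
embedder = EfficientEmbedder()
vectors = embedder.encode(["招商银行不良贷款率为0.95%", "今天天气很好"])
print(vectors.shape)  # (2, 512)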
