Embedding: Principles and Practice
What is Embedding
An embedding maps a piece of text to a point in a high-dimensional vector space, where semantically similar texts end up closer together.
python
from sentence_transformers import SentenceTransformer
import numpy as np

model = SentenceTransformer("BAAI/bge-small-zh-v1.5")

sentences = [
    "招商银行不良贷款率为0.95%",
    "CMB non-performing loan ratio is 0.95%",
    "今天天气很好",
]
embeddings = model.encode(sentences, normalize_embeddings=True)

# Compute cosine similarity
def cosine_sim(a, b):
    return np.dot(a, b)  # vectors are normalized, so the dot product is the cosine similarity

print(f"Chinese-English similarity: {cosine_sim(embeddings[0], embeddings[1]):.3f}")  # ~0.85
print(f"Unrelated sentence similarity: {cosine_sim(embeddings[0], embeddings[2]):.3f}")  # ~0.21
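Because similar texts land close together, a nearest-neighbor search over the embeddings already gives a basic semantic retriever. Below is a minimal sketch reusing the same model; the corpus sentences and query are made up purely for illustration.
python
from sentence_transformers import SentenceTransformer
import numpy as np

model = SentenceTransformer("BAAI/bge-small-zh-v1.5")

# Toy corpus (illustrative sentences, not from a real dataset)
corpus = [
    "招商银行2024年不良贷款率为0.95%",
    "平安银行发布2024年第三季度财报",
    "今天上海天气晴朗",
]
corpus_emb = model.encode(corpus, normalize_embeddings=True)

query = "招行的资产质量如何"
query_emb = model.encode([query], normalize_embeddings=True)[0]

# Dot product equals cosine similarity because embeddings are normalized
scores = corpus_emb @ query_emb
for i in np.argsort(-scores):  # rank documents from most to least similar
    print(f"{scores[i]:.3f}  {corpus[i]}")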
BGE Series (Recommended)
python
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

# BGE models expect an instruction prefix on the query side
embeddings = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-large-zh-v1.5",
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True},
    query_instruction="为这个句子生成表示以用于检索相关文章:"
)

# The prefix is added automatically at query time
query_embedding = embeddings.embed_query("不良贷款率是什么")
doc_embedding = embeddings.embed_documents(["不良贷款率的定义..."])
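The same embeddings object can be dropped into a LangChain vector store, which calls embed_documents when indexing and embed_query (with the instruction prefix) when searching. A minimal sketch, assuming faiss-cpu is installed; the two documents are illustrative.
python
from langchain_community.vectorstores import FAISS

docs = [
    "不良贷款率是指不良贷款占贷款总额的比例。",
    "拨备覆盖率衡量银行对不良贷款的风险抵补能力。",
]
vectorstore = FAISS.from_texts(docs, embedding=embeddings)      # uses embed_documents
hits = vectorstore.similarity_search("不良贷款率是什么", k=1)   # uses embed_query
print(hits[0].page_content)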
API Embedding (No Local Model Required)
python
from openai import OpenAI

client = OpenAI(
    api_key="sk-xxx",
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
)

def get_embedding(text: str) -> list[float]:
    response = client.embeddings.create(
        model="text-embedding-v3",  # Qwen embedding model
        input=text,
        dimensions=512  # optional output dimension
    )
    return response.data[0].embedding

embedding = get_embedding("招商银行2024年年报")
print(f"Vector dimension: {len(embedding)}")
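The embeddings endpoint also accepts a list of inputs, so several texts can be embedded per request. A hedged sketch building on the client above; the per-request batch limit varies by provider, and the value of 10 used here is an assumption.
python
def get_embeddings(texts: list[str], batch_size: int = 10) -> list[list[float]]:
    """Embed many texts by sending them to the API in small batches."""
    results: list[list[float]] = []
    for i in range(0, len(texts), batch_size):
        response = client.embeddings.create(
            model="text-embedding-v3",
            input=texts[i:i + batch_size],  # list input: one request, many texts
            dimensions=512,
        )
        # Vectors come back in the same order as the inputs
        results.extend(item.embedding for item in response.data)
    return results

vectors = get_embeddings(["招商银行2024年年报", "平安银行2024年年报"])
print(len(vectors), len(vectors[0]))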
Batch Embedding Optimization
python
from typing import List

import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel

class EfficientEmbedder:
    def __init__(self, model_name: str = "BAAI/bge-small-zh-v1.5"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.model.eval()

    @torch.no_grad()
    def encode(self, texts: List[str], batch_size: int = 32) -> np.ndarray:
        all_embeddings = []
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i+batch_size]
            encoded = self.tokenizer(
                batch,
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors="pt"
            )
            outputs = self.model(**encoded)
            # Use the [CLS] token as the sentence representation
            embeddings = outputs.last_hidden_state[:, 0, :]
            embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
            all_embeddings.append(embeddings.numpy())
        return np.vstack(all_embeddings)
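A short usage sketch for the class above; the texts are placeholders and the resulting embedding dimension depends on the chosen model.
python
embedder = EfficientEmbedder()
texts = [f"示例文本 {i}" for i in range(100)]
vectors = embedder.encode(texts, batch_size=32)
print(vectors.shape)  # (100, embedding_dim of the model)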