Milvus — 分布式向量数据库
简介
Milvus 是专为生产环境设计的分布式向量数据库,支持十亿级向量、多种索引类型、水平扩展,适合金融级生产部署。
bash
pip install pymilvus
Docker 部署
bash
# Standalone deployment (Milvus Standalone): download the v2.4.0
# docker-compose file published with the Milvus GitHub release.
wget https://github.com/milvus-io/milvus/releases/download/v2.4.0/milvus-standalone-docker-compose.yml
docker compose -f milvus-standalone-docker-compose.yml up -d
基础操作
python
from pymilvus import connections, Collection, FieldSchema, CollectionSchema, DataType, utility
import numpy as np
# Open the "default" connection to the Milvus server at localhost:19530.
connections.connect("default", host="localhost", port="19530")

# Schema: an auto-generated INT64 primary key, two VARCHAR columns and a
# 512-dimensional float vector column.
primary_key = FieldSchema(
    name="id", dtype=DataType.INT64, is_primary=True, auto_id=True
)
content_field = FieldSchema(
    name="content", dtype=DataType.VARCHAR, max_length=2000
)
source_field = FieldSchema(
    name="source", dtype=DataType.VARCHAR, max_length=200
)
vector_field = FieldSchema(
    name="embedding", dtype=DataType.FLOAT_VECTOR, dim=512
)
fields = [primary_key, content_field, source_field, vector_field]
schema = CollectionSchema(fields, description="金融知识库")
collection = Collection("finance_kb", schema)

# Build an IVF_FLAT index with cosine similarity on the vector column.
index_params = dict(
    metric_type="COSINE",
    index_type="IVF_FLAT",
    params=dict(nlist=128),
)
collection.create_index("embedding", index_params)

# Sample rows as [content, source] pairs.
data = [
    ["不良贷款率是衡量银行资产质量的核心指标", "风控手册"],
    ["资本充足率反映银行抵御风险的能力", "监管文件"],
]
# Random placeholder vectors stand in for real embeddings, one per row.
embeddings = np.random.randn(len(data), 512).astype(np.float32)

# Column-wise insert: content, source, embedding. No id column is passed
# because the primary key is declared with auto_id=True.
content_column = [content for content, _ in data]
source_column = [source for _, source in data]
collection.insert([content_column, source_column, embeddings.tolist()])
collection.flush()

# Load the collection into memory before searching.
collection.load()

# Search with a single random query vector and return the top 3 hits
# together with their content and source fields.
query_embedding = np.random.randn(1, 512).astype(np.float32)
search_params = {"metric_type": "COSINE", "params": {"nprobe": 10}}
results = collection.search(
    data=query_embedding.tolist(),
    anns_field="embedding",
    param=search_params,
    limit=3,
    output_fields=["content", "source"],
)
for hit in results[0]:
    print(f"相似度: {hit.score:.3f} | {hit.entity.get('content')}")
与 LangChain 集成
python
from langchain_community.vectorstores import Milvus
# Build a LangChain vector store backed by the local Milvus server.
# `chunks` and `embeddings` are not defined in this snippet and must be
# supplied by the caller.
# NOTE(review): `embeddings` here is presumably a LangChain embedding model,
# not the NumPy array of the same name in the pymilvus example — confirm.
# NOTE(review): "finance_kb" is also the collection created in the pymilvus
# example (fields id/content/source/embedding); verify that LangChain's
# expected field layout is compatible before reusing the name.
milvus_connection = {"host": "localhost", "port": "19530"}
vectorstore = Milvus.from_documents(
    documents=chunks,
    embedding=embeddings,
    connection_args=milvus_connection,
    collection_name="finance_kb",
)

# Retriever that returns the 3 best matches per query.
retriever = vectorstore.as_retriever(search_kwargs=dict(k=3))