Chroma — 轻量级向量数据库
简介
Chroma 是专为 LLM 应用设计的开源向量数据库,支持本地嵌入式运行和客户端-服务器模式,是 RAG 原型开发的首选。
bash
pip install chromadb
本地使用
python
import chromadb
from chromadb.utils import embedding_functions
# Local persistence: data is stored under ./chroma_db
client = chromadb.PersistentClient(path="./chroma_db")

# Use a HuggingFace embedding model
hf_ef = embedding_functions.HuggingFaceEmbeddingFunction(
    api_key="hf-xxx",
    model_name="BAAI/bge-small-zh-v1.5",
)

# Create the collection (get_or_create_collection is used, so reruns are safe to repeat)
collection = client.get_or_create_collection(
    name="finance_docs",
    embedding_function=hf_ef,
    metadata={"hnsw:space": "cosine"},  # cosine similarity
)
# Add documents; documents, metadatas and ids are parallel lists (one entry per document)
collection.add(
    documents=[
        "不良贷款率是衡量银行资产质量的核心指标",
        "资本充足率反映银行抵御风险的能力",
        "净息差是银行盈利能力的重要指标",
    ],
    metadatas=[
        {"source": "风控手册", "category": "指标"},
        {"source": "监管文件", "category": "指标"},
        {"source": "分析报告", "category": "指标"},
    ],
    ids=["doc1", "doc2", "doc3"],
)
# Query: ask for the 2 nearest documents to the query text
results = collection.query(
    query_texts=["银行贷款质量怎么评估"],
    n_results=2,
    where={"category": "指标"},  # metadata filter
)
# Print every hit for the first (only) query text; the score shown is 1 - distance
for doc, distance in zip(results["documents"][0], results["distances"][0]):
    print(f"相似度: {1-distance:.3f} | {doc}")
服务器模式(生产部署)
bash
# 启动 Chroma 服务器
chroma run --host 0.0.0.0 --port 8001 --path ./chroma_data
python
# 客户端连接
client = chromadb.HttpClient(host="localhost", port=8001)
与 LangChain 集成
python
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader
# Load documents: every .txt file under ./docs, searched recursively
loader = DirectoryLoader(
    "./docs",
    glob="**/*.txt",
)
documents = loader.load()

# Split into chunks
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)
chunks = splitter.split_documents(documents)
# Build the vector store from the chunks, persisted under ./chroma_db
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-small-zh-v1.5")
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory="./chroma_db",
    collection_name="finance_kb",
)

# Retrieval: a retriever configured with k=3
retriever = vectorstore.as_retriever(
    search_kwargs={"k": 3},
)
docs = retriever.invoke("不良贷款率的计算方法")
批量更新
python
# Incremental add (new_documents is not defined in this snippet; supply your own list)
vectorstore.add_documents(new_documents)

# Delete documents by id
vectorstore.delete(
    ids=["doc1", "doc2"],
)

# Show statistics; note this reads the private _collection attribute
print(
    f"文档数量: {vectorstore._collection.count()}"
)