feat: rag

This commit is contained in:
heyong.fu
2026-05-06 11:35:10 +08:00
commit a17c65c4bc
75 changed files with 5196 additions and 0 deletions
+31
View File
@@ -0,0 +1,31 @@
# Quick start: an in-memory Chroma client, one collection, one similarity query.
import chromadb

# EphemeralClient keeps its data in memory only; nothing is saved to disk.
client = chromadb.EphemeralClient()
collection = client.create_collection(name="test")

# Four short documents, each paired with an id of the form "test_<n>".
documents = ["今天天气有风", "很冷", "注意保暖", "加油学习"]
ids = [f"test_{n}" for n in range(1, len(documents) + 1)]
collection.add(documents=documents, ids=ids)

# Ask for the two stored documents closest to the query text.
results = collection.query(query_texts=["天气"], n_results=2)
print(f"打印数据结果{results}")

# Output recorded from one run of this script:
# {
#     'ids': [['test_1', 'test_2']],
#     'embeddings': None,
#     'documents': [['今天天气有风', '很冷']],
#     'uris': None,
#     'included': ['metadatas', 'documents', 'distances'],
#     'data': None,
#     'metadatas': [[None, None]],
#     'distances': [[0.2988046705722809, 0.9478188753128052]]
# }
+22
View File
@@ -0,0 +1,22 @@
# Persistent storage: create a collection inside an on-disk Chroma store.
import chromadb

# `path` is the directory that holds the data. Per the original author's
# note, Chroma creates the directory when it does not exist yet.
persistent_client = chromadb.PersistentClient(path="./chromadb_store")

# A collection plays roughly the role of a table.
# The store is persistent, so "notes" is still there on every run after the
# first one. get_or_create=True makes create_collection return the existing
# collection instead of failing with an "already exists" error.
collection = persistent_client.create_collection(
    name="notes",
    metadata={"description": "笔记集合"},
    get_or_create=True,
)

# List every collection in the store to confirm that "notes" is present.
collections = persistent_client.list_collections()
print(collections)
for col in collections:
    print(f"-{col.name}")
+23
View File
@@ -0,0 +1,23 @@
# Fetch a collection that already exists in the persistent store.
# Two options are shown: get_collection() and get_or_create_collection().
import chromadb

COLLECTION_NAME = "notes"

client = chromadb.PersistentClient(path="./chromadb_store")

# Option 1: get_collection() -- a failure is reported instead of propagated.
try:
    existing = client.get_collection(name=COLLECTION_NAME)
    print("集合已经存在", existing.name)
except Exception as err:
    print("集合不存在", err)

# Option 2 (the one the original author recommends): fetch the collection,
# creating it when it is not there yet.
collection_metadata = {"description": "笔记集合"}
collection = client.get_or_create_collection(
    name=COLLECTION_NAME,
    metadata=collection_metadata,
)
print(collection.name)
+42
View File
@@ -0,0 +1,42 @@
# Write documents, their metadata and their ids into a persistent collection.
import chromadb

client = chromadb.PersistentClient(path="./chromadb_store")
collection = client.get_or_create_collection(name="knowledge_base")

# One row per record: (unique id, document text, metadata dict).
records = [
    ("doc_1", "机器学习包含监督学习和无监督学习", {"topic": "ml", "level": "intro"}),
    ("doc_2", "Python 拥有丰富的数据科学生态", {"topic": "python", "level": "beginner"}),
    ("doc_3", "数据库可以持久化结构化或非结构化数据", {"topic": "database", "level": "intro"}),
]

# add() takes three parallel lists, so split the rows column by column.
# Per the original author's note, add() converts each document to a vector.
collection.add(
    ids=[record_id for record_id, _, _ in records],
    documents=[text for _, text, _ in records],
    metadatas=[meta for _, _, meta in records],
)

# Show the collections in the store, then how many records this one holds.
print(client.list_collections())
print(collection.count())
+45
View File
@@ -0,0 +1,45 @@
# Query a persistent collection and print each match in a readable layout.
import chromadb

client = chromadb.PersistentClient(path="./chromadb_store")
# The collection must already exist; it is fetched here, not created.
collection = client.get_collection(name="knowledge_base")

# query_texts is the text to search for; n_results=2 asks for the two most
# similar records.
results = collection.query(query_texts=["如何入门机器学习"], n_results=2)

# Shape of `results`, recorded from one run:
# {
#     "ids": [["doc_1", "doc_2"]],
#     "embeddings": None,
#     "documents": [
#         ["机器学习包含监督学习和无监督学习", "Python 拥有丰富的数据科学生态"]
#     ],
#     "uris": None,
#     "included": ["metadatas", "documents", "distances"],
#     "data": None,
#     "metadatas": [
#         [{"level": "intro", "topic": "ml"}, {"topic": "python", "level": "beginner"}]
#     ],
#     "distances": [[0.24633410573005676, 0.8512163758277893]],
# }

# Each field holds one inner list per query text. Only one query was sent,
# so take index 0 of every field and walk the four lists in lockstep.
fields = ("documents", "metadatas", "distances", "ids")
rows = zip(*(results[field][0] for field in fields))

for idx, (doc, metadata, distance, doc_id) in enumerate(rows, start=1):
    print(
        f"结果{idx}",
        f"文档ID{doc_id}",
        f"匹配文档{doc}",
        f"附加信息{metadata}",
        f"相似度距离{distance}",
        "-" * 50,
        sep="\n",
    )
+64
View File
@@ -0,0 +1,64 @@
# End-to-end flow: open the store, write records, fetch one by id, query.
from chromadb import PersistentClient

client = PersistentClient(path="./chromadb_store")
collection = client.get_or_create_collection(name="example")

# Documents to store.
documents = [
    "机器学习包含监督学习和无监督学习",
    "Python 拥有丰富的数据科学生态",
    "数据库可以持久化结构化或非结构化数据",
]

# One metadata dict per document, in the same order.
metadatas = [
    {"topic": "ml", "level": "intro"},
    {"topic": "python", "level": "beginner"},
    {"topic": "database", "level": "intro"},
]

# One id per document. The third id used to be misspelled "dic3"; a store
# written by an earlier run may still hold a record under that old id.
ids = ["doc1", "doc2", "doc3"]

# Write the records.
collection.add(documents=documents, metadatas=metadatas, ids=ids)

# Fetch a single record directly by its id.
fetched = collection.get(ids=["doc2"])
print(fetched)

# Similarity query: the two records closest to the query text.
result = collection.query(query_texts=["如何入门机器学习"], n_results=2)
# print(result)
# Shape of `result`, recorded from one run:
# {
#     "ids": [["doc1", "doc2"]],
#     "embeddings": None,
#     "documents": [
#         ["机器学习包含监督学习和无监督学习", "Python 拥有丰富的数据科学生态"]
#     ],
#     "uris": None,
#     "included": ["metadatas", "documents", "distances"],
#     "data": None,
#     "metadatas": [
#         [{"topic": "ml", "level": "intro"}, {"topic": "python", "level": "beginner"}]
#     ],
#     "distances": [[0.24633410573005676, 0.8512163758277893]],
# }

# Optional pretty-printer for the matches, left disabled as in the original.
# for index, (doc_id, doc, metadata, distance) in enumerate(
#     zip(
#         result["ids"][0],
#         result["documents"][0],
#         result["metadatas"][0],
#         result["distances"][0],
#     ),
#     1,
# ):
#     print(f"匹配结果 {index}:")
#     print(f"  文档:{doc}")
#     print(f"  元数据:{metadata}")
#     print(f"  距离:{distance:.4f}")
#     print()