feat: rag

2026-05-06 11:35:10 +08:00
commit a17c65c4bc
75 changed files with 5196 additions and 0 deletions
@@ -0,0 +1,31 @@
+# 创建临时客户端
+
+import chromadb
+
+# 创建一个临时的内存客户端（不会保存到硬盘）
+client = chromadb.EphemeralClient()
+
+# 创建一个集合
+collection = client.create_collection(name="test")
+
+# 添加一条数据
+collection.add(
+    documents=["今天天气有风", "很冷", "注意保暖", "加油学习"],
+    ids=["test_1", "test_2", "test_3", "test_4"],
+)
+
+# 查询数据
+results = collection.query(query_texts=["天气"], n_results=2)
+
+print(f"打印数据结果{results}")
+
+# {
+#   'ids': [['test_1', 'test_2']],
+#   'embeddings': None,
+#   'documents': [['今天天气有风', '很冷']],
+#   'uris': None,
+#   'included': ['metadatas', 'documents', 'distances'],
+#   'data': None,
+#   'metadatas': [[None, None]],
+#   'distances': [[0.2988046705722809, 0.9478188753128052]]
+# }
@@ -0,0 +1,22 @@
+# 持久化存储
+
+import chromadb
+
+# 持久化客户端
+# path指定数据存储的路径
+# 如果目录不存在，Chromadb会自动创建
+persistent_client = chromadb.PersistentClient(path="./chromadb_store")
+
+
+# 创建一个集合（类似创建一个表）
+collection = persistent_client.create_collection(
+    name="notes", metadata={"description": "笔记集合"}  # 集合名称  # 集合元数据
+)
+
+# 列出所有集合，确认创建成功
+# list_collections() 返回所有集合的列表
+collections = persistent_client.list_collections()
+print(collections)
+
+for col in collections:
+    print(f"-{col.name}")
@@ -0,0 +1,23 @@
+# 获取已经存在的集合
+
+# 如果集合已经存在，可以使用get_collection() 或者 get_or_create_collection() 方法
+
+import chromadb
+
+# 创建持久化客户端
+client = chromadb.PersistentClient(path="./chromadb_store")
+
+# 方法1：获取已存在的集合
+try:
+    existring_collection = client.get_collection(name="notes")
+    print("集合已经存在", existring_collection.name)
+except Exception as e:
+    print("集合不存在", e)
+
+
+# 方法2：获取或者创建集合（推荐使用）
+
+collection = client.get_or_create_collection(
+    name="notes", metadata={"description": "笔记集合"}
+)
+print(collection.name)
@@ -0,0 +1,42 @@
+# 写入数据
+
+import chromadb
+
+# 创建持久化客户端
+
+client = chromadb.PersistentClient(path="./chromadb_store")
+
+
+# 创建集合
+collection = client.get_or_create_collection(name="knowledge_base")
+
+# 准备说明文档
+documents = [
+    "机器学习包含监督学习和无监督学习",
+    "Python 拥有丰富的数据科学生态",
+    "数据库可以持久化结构化或非结构化数据",
+]
+
+# 准备元组数据
+metadatas = [
+    {"topic": "ml", "level": "intro"},
+    {"topic": "python", "level": "beginner"},
+    {"topic": "database", "level": "intro"},
+]
+
+# 准备唯一标识
+# ids 是一个列表，每个元素对应一个文档的唯一ID
+# 如果不提供，Chromedb会自动生成
+ids = ["doc_1", "doc_2", "doc_3"]
+
+# 将数据添加到集合中
+# add() 方法会将文档转为向量
+collection.add(documents=documents, metadatas=metadatas, ids=ids)
+
+# 获取集合列表
+collections = client.list_collections()
+print(collections)
+
+# 查看集合中的文档
+doc_count = collection.count()
+print(doc_count)
@@ -0,0 +1,45 @@
+# 查询数据
+import chromadb
+
+# 创建持久化客户端
+client = chromadb.PersistentClient(path="./chromadb_store")
+
+# 获取已经存在的集合
+collection = client.get_collection(name="knowledge_base")
+
+# query_texts 查询文本
+# n_results 返回最相似的两条结果
+results = collection.query(query_texts=["如何入门机器学习"], n_results=2)
+
+# print(results)
+
+# {
+#     "ids": [["doc_1", "doc_2"]],
+#     "embeddings": None,
+#     "documents": [
+#         ["机器学习包含监督学习和无监督学习", "Python 拥有丰富的数据科学生态"]
+#     ],
+#     "uris": None,
+#     "included": ["metadatas", "documents", "distances"],
+#     "data": None,
+#     "metadatas": [
+#         [{"level": "intro", "topic": "ml"}, {"topic": "python", "level": "beginner"}]
+#     ],
+#     "distances": [[0.24633410573005676, 0.8512163758277893]],
+# }
+
+for idx, (doc, metadata, distances, doc_id) in enumerate(
+    zip(
+        results["documents"][0],
+        results["metadatas"][0],
+        results["distances"][0],
+        results["ids"][0],
+    ),
+    1,
+):
+    print(f"结果{idx}")
+    print(f"文档ID{doc_id}")
+    print(f"匹配文档{doc}")
+    print(f"附加信息{metadata}")
+    print(f"相似度距离{distances}")
+    print("-" * 50)
@@ -0,0 +1,64 @@
+# 完整流程
+
+from chromadb import PersistentClient
+
+# 创建持久化客户端
+client = PersistentClient(path="./chromadb_store")
+
+# 获取或者创建集合
+collection = client.get_or_create_collection(name="example")
+
+# 准备说明文档
+documents = [
+    "机器学习包含监督学习和无监督学习",
+    "Python 拥有丰富的数据科学生态",
+    "数据库可以持久化结构化或非结构化数据",
+]
+# 创建元数据
+metadatas = [
+    {"topic": "ml", "level": "intro"},
+    {"topic": "python", "level": "beginner"},
+    {"topic": "database", "level": "intro"},
+]
+
+# ids
+ids = ["doc1", "doc2", "dic3"]
+
+# 写入数据
+collection.add(documents=documents, metadatas=metadatas, ids=ids)
+
+abc = collection.get(ids=["doc2"])
+print(abc)
+
+# 查询
+result = collection.query(query_texts=["如何入门机器学习"], n_results=2)
+
+# print(result)
+# {
+#     "ids": [["doc1", "doc2"]],
+#     "embeddings": None,
+#     "documents": [
+#         ["机器学习包含监督学习和无监督学习", "Python 拥有丰富的数据科学生态"]
+#     ],
+#     "uris": None,
+#     "included": ["metadatas", "documents", "distances"],
+#     "data": None,
+#     "metadatas": [
+#         [{"topic": "ml", "level": "intro"}, {"topic": "python", "level": "beginner"}]
+#     ],
+#     "distances": [[0.24633410573005676, 0.8512163758277893]],
+# }
+# for index, (id, doc, metadata, distance) in enumerate(
+#     zip(
+#         result["ids"][0],
+#         result["documents"][0],
+#         result["metadatas"][0],
+#         result["distances"][0],
+#     ),
+#     1,
+# ):
+#     print(f"匹配结果 {index}:")
+#     print(f"  文档：{doc}")
+#     print(f"  元数据：{metadata}")
+#     print(f"  距离：{distance:.4f}")
+#     print()