Files
02vector/jieba_anli.py
2026-05-06 11:29:40 +08:00

56 lines
2.1 KiB
Python

from collections import defaultdict
import jieba
# Sample corpus: each document is a dict with an "id", a "partition_key"
# and its "content" text.
documents = [
    {"id": doc_id, "partition_key": part_key, "content": text}
    for doc_id, part_key, text in (
        (1, "", "张三喜欢编程和电脑游戏"),
        (2, "", "李四热爱计算机科学"),
        (3, "", "王五喜欢阅读技术书籍"),
        (4, "", "张杰钟爱笔记本电脑"),
    )
]
# Synonym table used to widen a query: each key is the representative word
# and its value is the full set of interchangeable words (the key included).
synonym_map = {
    head: {head, *aliases}
    for head, aliases in (
        ("电脑", ("计算机", "PC")),
        ("编程", ("软件开发", "程序设计")),
    )
}
def build_partitions(docs):
    """Group documents by the value stored under their "partition_key".

    Args:
        docs: Iterable of document dicts; each must have a "partition_key"
            entry (a missing key raises ``KeyError``).

    Returns:
        A ``defaultdict(list)`` mapping every partition key to the documents
        that carry it, in the order they were encountered.
    """
    grouped = defaultdict(list)
    for record in docs:
        bucket_key = record["partition_key"]
        grouped[bucket_key].append(record)
    return grouped
def tokenize(text):
    """Segment ``text`` with ``jieba.lcut_for_search`` and drop blank tokens.

    Args:
        text: The string to segment.

    Returns:
        A list of the tokens produced by jieba, without those that are empty
        or consist only of whitespace.
    """
    segments = jieba.lcut_for_search(text)
    return [segment for segment in segments if segment.strip()]
def match_query(partitions, keywords, use_synonym=True):
    """Find the documents whose tokens contain the query term or a synonym.

    Args:
        partitions: Mapping of partition key to a list of document dicts;
            each document needs "id" and "content" entries.
        keywords: The query term (a single string).
        use_synonym: When true, the term is widened with every group in
            ``synonym_map`` that it heads or belongs to.

    Returns:
        A list of ``(partition_key, doc_id, content)`` tuples, one per
        matching document, in partition and document iteration order.
    """
    terms = {keywords}
    if use_synonym:
        for head, group in synonym_map.items():
            # Expand in both directions: looking up only the representative
            # word would leave a query such as "计算机" unable to reach
            # documents that say "电脑", although both are in one group.
            if keywords == head or keywords in group:
                # Build a new set via union so the shared synonym_map
                # groups are never mutated or aliased.
                terms = terms | {head} | group

    result = []
    for key, docs in partitions.items():
        for doc in docs:
            # A document matches when at least one of its tokens is one of
            # the (possibly expanded) query terms.
            if not terms.isdisjoint(tokenize(doc["content"])):
                result.append((key, doc["id"], doc["content"]))
    return result
# 1. Group the sample documents by their partition key.
partitions = build_partitions(docs=documents)
# Debug aid (disabled): print(partitions)
# NOTE(review): the sample output previously recorded here showed three
# partitions keyed '张' (ids 1 and 4), '李' (id 2) and '王' (id 3), but the
# documents defined in this file all use an empty partition_key, which puts
# everything into the single '' partition -- confirm which is intended.
# 2. Search for "电脑" with synonym expansion enabled and show the hits.
hits = match_query(partitions, "电脑", True)
print(hits)