56 lines
2.1 KiB
Python
56 lines
2.1 KiB
Python
from collections import defaultdict
|
|
import jieba
|
|
|
|
# Sample corpus: every document carries an "id", a "partition_key" and its
# "content" text.
documents = [
    {"id": doc_id, "partition_key": key, "content": content}
    for doc_id, key, content in (
        (1, "张", "张三喜欢编程和电脑游戏"),
        (2, "李", "李四热爱计算机科学"),
        (3, "王", "王五喜欢阅读技术书籍"),
        (4, "张", "张杰钟爱笔记本电脑"),
    )
]
|
|
|
|
# Synonym table used to widen a query: each key is the representative word
# and its value is the full set of interchangeable words (the key included).
synonym_map = {
    group[0]: set(group)
    for group in (
        ("电脑", "计算机", "PC"),
        ("编程", "软件开发", "程序设计"),
    )
}
|
|
|
|
|
|
def build_partitions(docs):
    """Group documents by their ``"partition_key"`` value.

    Args:
        docs: Iterable of dicts, each providing a ``"partition_key"`` entry.

    Returns:
        A ``defaultdict(list)`` mapping every partition key to the documents
        that carry it, in the order they were encountered.
    """
    grouped = defaultdict(list)
    for record in docs:
        bucket = grouped[record["partition_key"]]
        bucket.append(record)
    return grouped
|
|
|
|
|
|
def tokenize(text):
    """Segment ``text`` with jieba's search-mode cutter.

    Args:
        text: The string to segment.

    Returns:
        The list produced by ``jieba.lcut_for_search`` with every token that
        is empty or whitespace-only removed.
    """
    kept = []
    for piece in jieba.lcut_for_search(text):
        # Skip tokens that are blank once surrounding whitespace is removed.
        if piece.strip():
            kept.append(piece)
    return kept
|
|
|
|
|
|
def _expand_keywords(keywords, use_synonym):
    """Return the set of terms a query has to be compared against."""
    # A list/tuple/set/frozenset carries several terms; anything else
    # (typically a single str) is treated as one term.
    if isinstance(keywords, (list, tuple, set, frozenset)):
        terms = set(keywords)
    else:
        terms = {keywords}
    if not use_synonym:
        return terms

    # Work on a copy so the sets stored in ``synonym_map`` are never aliased
    # or mutated, and so the literal query terms are always kept.
    expanded = set(terms)
    for head, group in synonym_map.items():
        # Expand in both directions: the query may be the representative
        # word (the key) or any member of its synonym set.
        if head in terms or not terms.isdisjoint(group):
            expanded.add(head)
            expanded.update(group)
    return expanded


def match_query(partitions, keywords, use_synonym=True):
    """Find documents whose tokens contain a query term or one of its synonyms.

    Args:
        partitions: Mapping of partition key to a list of document dicts;
            each document must provide ``"id"`` and ``"content"``.
        keywords: A single query term (``str``) or a list/tuple/set/frozenset
            of query terms.
        use_synonym: When true, every term that appears in ``synonym_map``
            (as a key or inside a synonym set) is widened to its whole
            synonym group.

    Returns:
        A list of ``(partition_key, doc_id, content)`` tuples, one per
        matching document, in partition and document iteration order.
    """
    terms = _expand_keywords(keywords, use_synonym)
    result = []
    for key, docs in partitions.items():
        for doc in docs:
            # A document matches when its tokens share at least one term
            # with the (possibly expanded) query.
            if not terms.isdisjoint(tokenize(doc["content"])):
                result.append((key, doc["id"], doc["content"]))
    return result
|
|
|
|
|
|
# Step 1: group the sample documents by their partition key.
partitions = build_partitions(documents)
# Resulting layout (partition key -> ids of the documents stored under it):
#   '张' -> 1, 4    '李' -> 2    '王' -> 3

# Step 2: run a tokenized query for "电脑" with synonym expansion switched on.
hits = match_query(
    partitions,
    keywords="电脑",
    use_synonym=True,
)
print(hits)
|