from collections import defaultdict

import jieba

# Sample documents; each one carries an "id", a "partition_key" and a "content".
documents = [
    {"id": 1, "partition_key": "张", "content": "张三喜欢编程和电脑游戏"},
    {"id": 2, "partition_key": "李", "content": "李四热爱计算机科学"},
    {"id": 3, "partition_key": "王", "content": "王五喜欢阅读技术书籍"},
    {"id": 4, "partition_key": "张", "content": "张杰钟爱笔记本电脑"},
]

# Synonym groups used to expand a query: the key is the representative word
# and the value is the set of words treated as equivalent to it.
synonym_map = {
    "电脑": {"电脑", "计算机", "PC"},
    "编程": {"编程", "软件开发", "程序设计"},
}


def build_partitions(docs):
    """Group documents by their "partition_key" value.

    Args:
        docs: Iterable of dicts, each providing a "partition_key" entry.

    Returns:
        A ``defaultdict(list)`` mapping every partition key to the documents
        that carry it, in their original order.
    """
    partitions = defaultdict(list)
    for doc in docs:
        partitions[doc["partition_key"]].append(doc)
    return partitions


def tokenize(text):
    """Segment *text* with ``jieba.lcut_for_search`` and drop blank tokens."""
    return [token for token in jieba.lcut_for_search(text) if token.strip()]


def _expand_query(keywords, use_synonym):
    """Return the set of tokens that count as a hit for *keywords*.

    A single value is treated as one query term; a list, tuple, set or
    frozenset is treated as several terms. With *use_synonym* enabled, every
    term that appears in a ``synonym_map`` group (as its key or as a member)
    pulls in the whole group.
    """
    if isinstance(keywords, (list, tuple, set, frozenset)):
        terms = set(keywords)
    else:
        terms = {keywords}
    if not use_synonym:
        return terms

    # Start from a copy: the query terms must always match themselves, and the
    # shared sets stored in synonym_map must never be handed out or mutated.
    expanded = set(terms)
    for term in terms:
        for head, group in synonym_map.items():
            # Look the term up in both directions so that querying a
            # non-representative synonym still expands to the full group.
            if term == head or term in group:
                expanded.add(head)
                expanded.update(group)
    return expanded


def match_query(partitions, keywords, use_synonym=True):
    """Find documents whose tokens overlap the (optionally expanded) query.

    Args:
        partitions: Mapping of partition key to a list of document dicts, each
            providing "id" and "content".
        keywords: One query term, or a list/tuple/set/frozenset of terms.
        use_synonym: When true, expand the query through ``synonym_map``.

    Returns:
        A list of ``(partition_key, doc_id, content)`` tuples, one per matching
        document, in partition iteration order.
    """
    tokens_to_match = _expand_query(keywords, use_synonym)

    result = []
    for key, docs in partitions.items():
        for doc in docs:
            # A document matches as soon as one of its tokens is a query
            # token; isdisjoint stops at the first common element.
            if not tokens_to_match.isdisjoint(tokenize(doc["content"])):
                result.append((key, doc["id"], doc["content"]))
    return result


# 1. Group the documents by partition key.
partitions = build_partitions(documents)
# Resulting grouping: "张" -> documents 1 and 4, "李" -> document 2,
# "王" -> document 3.

# 2. Run the token match with synonym expansion enabled.
hits = match_query(partitions, keywords="电脑", use_synonym=True)
print(hits)