Files
02vector/jieba-student.py
T
2026-05-06 11:29:40 +08:00

38 lines
1.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import jieba
import jieba.analyse
# Sample sentence segmented below in each of jieba's three cutting modes.
text = "我爱自然语言处理技术"

# Each entry: (label to print, segmentation function, extra keyword arguments).
# Expected output noted by the original author:
#   precise mode       -> 我/爱/自然语言/处理/技术
#   full mode          -> 我/爱/自然/自然语言/语言/处理/技术
#   search-engine mode -> 我/爱/自然/语言/自然语言/处理/技术
modes = (
    ("精确模式:", jieba.cut, {"cut_all": False}),
    ("全模式:", jieba.cut, {"cut_all": True}),
    ("搜索引擎模式:", jieba.cut_for_search, {}),
)

for label, segment, options in modes:
    # The segmenter's result is consumed by the join below, one mode at a time.
    result = segment(text, **options)
    print(label, "/".join(result))
# 关键词提取
def main():
    """Print the top-5 weighted keywords of a sample sentence.

    Keywords are extracted twice from the same text: once with
    ``jieba.analyse.extract_tags`` (TF-IDF) and once with
    ``jieba.analyse.textrank`` (TextRank). Both extractions run before
    anything is printed; each result is then listed under its own heading.
    """
    sample = (
        "自然语言处理是人工智能和语言学领域的重要分支,"
        "研究如何让计算机理解和生成人类语言。"
    )

    # withWeight=True makes each result entry a (keyword, weight) pair.
    tfidf_scores = jieba.analyse.extract_tags(sample, topK=5, withWeight=True)
    textrank_scores = jieba.analyse.textrank(sample, topK=5, withWeight=True)

    # TF-IDF section: weights shown with two decimals, no space after the colon.
    print("TF-IDF")
    for keyword, score in tfidf_scores:
        print(f"{keyword}:{score:.2f}")

    # TextRank section: preceded by a blank line, weights shown with four decimals.
    print("\nTextRank")
    for keyword, score in textrank_scores:
        print(f"{keyword}: {score:.4f}")


main()