Files
03Rag/11按照字符长度进行分割.py
heyong.fu a17c65c4bc feat: rag
2026-05-06 11:35:10 +08:00

24 lines
805 B
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from langchain_text_splitters import CharacterTextSplitter
# Demo: split a long string into fixed-size chunks by character count.
#
# Create the character splitter: each chunk holds at most 100 characters,
# consecutive chunks do not overlap, and the separator is the empty string.
text_splitters = CharacterTextSplitter(
    chunk_size=100,  # maximum length of each chunk, in characters
    chunk_overlap=0,  # no characters shared between neighbouring chunks
    separator="",  # empty string used as the separator
)

# Build a 300-character sample text: 100 "1"s, then 100 "2"s, then 100 "3"s.
document = "1" * 100 + "2" * 100 + "3" * 100

# Cut the original text into chunks with the splitter's split_text method.
texts = text_splitters.split_text(document)

# Report the length of the original text.
print(f"原文长度{len(document)}")

# Report how many chunks were produced. Use len(texts): interpolating the
# list itself would dump every chunk's contents instead of the count.
print(f"分割为{len(texts)}个块")

# Show each chunk with its 1-based index, its length, and its repr.
for i, text in enumerate(texts, 1):
    print(f"\n{i}({len(text)}字符){repr(text)}")