import logging
import os
from typing import Optional

from langchain_text_splitters import RecursiveCharacterTextSplitter

from vectorstore import save_text_to_db
from extract_text_auto import extractTextAuto

# Log output format.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s"
)
logger = logging.getLogger(__name__)

# Default ChromaDB collection name that chunks are saved into.
DEFAULT_COLLECTION_NAME = "rag_system"
# Default chunk size handed to the text splitter.
DEFAULT_CHUNK_SIZE = 200
# Default overlap between consecutive chunks.
DEFAULT_CHUNK_OVERLAP = 30


def doc_to_vectorstore(
    file_path: str,
    collection_name: str = DEFAULT_COLLECTION_NAME,
    chunk_size: int = DEFAULT_CHUNK_SIZE,
    chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
) -> int:
    """Load a document, split it into chunks and save each chunk to the vector store.

    Args:
        file_path: Path of the file to ingest; passed to ``extractTextAuto``.
        collection_name: Name of the collection each chunk is saved into.
        chunk_size: ``chunk_size`` forwarded to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` forwarded to
            ``RecursiveCharacterTextSplitter``.

    Returns:
        The number of chunks that were saved without raising. ``0`` when the
        extracted text is empty or whitespace-only.
    """
    # 1. Load the file contents.
    text = extractTextAuto(file_path)
    if not text or not text.strip():
        # Covers both an empty string and a missing (None) extraction result.
        logger.warning(f"文件内容为空:{file_path}")
        return 0
    # Log only the size: echoing the full document would flood the output.
    logger.debug("文件加载完成,共%d个字符", len(text))

    # 2. Split the text into chunks.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = text_splitter.split_text(text)
    logger.info(f"文件分块完成,共分为{len(chunks)}块")

    # 3. Save every chunk to the vector database. This is best-effort: a
    #    failure on one chunk is logged and the remaining chunks are still tried.
    success_count = 0
    for idx, chunk in enumerate(chunks):
        try:
            save_text_to_db(chunk, collection_name=collection_name)
            success_count += 1
        except Exception as e:
            logger.error(f"保存第{idx+1}块失败:{str(e)}")

    logger.info(
        f"文件{file_path}已经完成向量化并入库,成功保存{success_count}/{len(chunks)}"
    )
    # The signature promises an int; report how many chunks were stored.
    return success_count


if __name__ == "__main__":
    # Run the sample ingestion only when executed as a script, so importing
    # this module does not trigger a database write.
    doc_to_vectorstore("西游记.txt")