feat: rag
This commit is contained in:
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
+410
@@ -0,0 +1,410 @@
|
||||
# 导入PyMuPDF库(fitz),用于处理PDF文件
|
||||
import fitz # PyMuPDF
|
||||
|
||||
# 导入Optional类型提示
|
||||
from typing import Optional
|
||||
|
||||
# 导入日志logging功能
|
||||
import logging
|
||||
|
||||
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# 定义用于提取PDF所有文本内容的函数
|
||||
def extract_pdf_text(pdf_path: str) -> str:
    """Return the text of every page of a PDF, joined by newlines.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The result of ``page.get_text("text")`` for each page, in page
        order, separated by a newline.

    Raises:
        FileNotFoundError: Logged and re-raised when raised while opening
            or reading the file.
        Exception: Any other failure is logged and re-raised unchanged.
    """
    try:
        document = fitz.open(pdf_path)
        try:
            # One entry per page, in document order.
            page_texts = [page.get_text("text") for page in document]  # type: ignore
            return "\n".join(page_texts)
        finally:
            # Close the document whether or not extraction succeeded.
            document.close()
    except FileNotFoundError:
        logger.error(f"PDF文件不存在: {pdf_path}")
        raise
    except Exception as exc:
        logger.error(f"提取PDF文本失败: {pdf_path}, 错误: {exc}")
        raise
|
||||
|
||||
|
||||
# 导入python-docx的Document类
|
||||
from docx import Document
|
||||
|
||||
|
||||
# 定义提取Word文档所有段落文本的函数
|
||||
def extract_text_from_word(file_path: str) -> str:
    """Return the paragraph text of a Word document as one string.

    Only ``Document(file_path).paragraphs`` is visited; content that is not
    exposed through that attribute is not read by this function.

    Args:
        file_path: Path of the Word document.

    Returns:
        The ``text`` of every paragraph, in order, separated by newlines.

    Raises:
        FileNotFoundError: Logged and re-raised.
        Exception: Any other failure is logged and re-raised unchanged.
    """
    try:
        document = Document(file_path)
        paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
        return "\n".join(paragraph_texts)
    except FileNotFoundError:
        logger.error(f"Word文件不存在: {file_path}")
        raise
    except Exception as exc:
        logger.error(f"提取Word文本失败: {file_path}, 错误: {exc}")
        raise
|
||||
|
||||
|
||||
# 导入openpyxl库,用于操作Excel文件
|
||||
import openpyxl
|
||||
|
||||
|
||||
# 定义函数提取Excel文件中的所有文本
|
||||
def extract_text_from_excel(file_path: str) -> str:
    """Return the cell values of every worksheet in a workbook as text.

    Each row becomes one line whose cells are separated by tabs; empty
    cells become empty strings. Rows of all worksheets are emitted in
    workbook order and joined with newlines. The workbook is loaded with
    ``data_only=True``.

    Previously only the active sheet was read, so every other sheet was
    silently dropped. For a workbook whose only sheet is the active one the
    output is unchanged.

    Args:
        file_path: Path of the Excel workbook.

    Returns:
        The tab/newline separated text described above.

    Raises:
        FileNotFoundError: Logged and re-raised.
        Exception: Any other failure is logged and re-raised unchanged.
    """
    try:
        workbook = openpyxl.load_workbook(file_path, data_only=True)
        try:
            lines = []
            # `worksheets` lists only real worksheets, so a chartsheet
            # (which has no iter_rows) is never visited.
            for sheet in workbook.worksheets:
                for row in sheet.iter_rows(values_only=True):
                    lines.append(
                        "\t".join("" if cell is None else str(cell) for cell in row)
                    )
            return "\n".join(lines)
        finally:
            # Close the workbook whether or not extraction succeeded.
            workbook.close()
    except FileNotFoundError:
        logger.error(f"Excel文件不存在: {file_path}")
        raise
    except Exception as e:
        logger.error(f"提取Excel文本失败: {file_path}, 错误: {str(e)}")
        raise
|
||||
|
||||
|
||||
# 导入python-pptx库的Presentation类
|
||||
from pptx import Presentation
|
||||
|
||||
|
||||
# 定义函数提取PPT文件所有文本内容
|
||||
def extract_ppt_text(file_path: str) -> str:
    """Return the text of every shape in a presentation, joined by newlines.

    Shapes are visited slide by slide. A shape contributes its ``text``
    only when it has a ``text`` attribute and that text is not blank.

    Args:
        file_path: Path of the presentation file.

    Returns:
        The collected shape texts, in slide/shape order, separated by
        newlines.

    Raises:
        FileNotFoundError: Logged and re-raised.
        Exception: Any other failure is logged and re-raised unchanged.
    """
    try:
        presentation = Presentation(file_path)
        shape_texts = [
            shape.text
            for slide in presentation.slides
            for shape in slide.shapes
            if hasattr(shape, "text") and shape.text.strip()
        ]
        return "\n".join(shape_texts)
    except FileNotFoundError:
        logger.error(f"PPT文件不存在: {file_path}")
        raise
    except Exception as exc:
        logger.error(f"提取PPT文本失败: {file_path}, 错误: {exc}")
        raise
|
||||
|
||||
|
||||
# 导入BeautifulSoup用于解析HTML
|
||||
from bs4 import BeautifulSoup # BeautifulSoup用于解析HTML
|
||||
|
||||
|
||||
# 定义函数,从HTML文件提取所有文本内容
|
||||
def extract_text_from_html(file_path: str) -> str:
    """Return the text content of an HTML file.

    The file is read as UTF-8, parsed with BeautifulSoup's ``html.parser``
    and flattened with ``get_text(separator="\\n", strip=True)``.

    Args:
        file_path: Path of the HTML file.

    Returns:
        The extracted text.

    Raises:
        FileNotFoundError: Logged and re-raised.
        Exception: Any other failure is logged and re-raised unchanged.
    """
    try:
        with open(file_path, encoding="utf-8") as handle:
            markup = handle.read()
        parsed = BeautifulSoup(markup, "html.parser")
        return parsed.get_text(separator="\n", strip=True)
    except FileNotFoundError:
        logger.error(f"HTML文件不存在: {file_path}")
        raise
    except Exception as exc:
        logger.error(f"提取HTML文本失败: {file_path}, 错误: {exc}")
        raise
|
||||
|
||||
|
||||
# 导入内置json库
|
||||
import json
|
||||
|
||||
|
||||
# 定义提取JSON文件文本内容的函数
|
||||
def extract_text_from_json(filename: str) -> str:
    """Load a JSON file and return it re-serialised as indented text.

    Args:
        filename: Path of the JSON file, read as UTF-8.

    Returns:
        The parsed document dumped with a two-space indent and
        ``ensure_ascii=False``, so non-ASCII characters are kept as-is.

    Raises:
        FileNotFoundError: Logged and re-raised.
        json.JSONDecodeError: Logged and re-raised when the content is not
            valid JSON.
    """
    try:
        with open(filename, encoding="utf-8") as handle:
            parsed = json.loads(handle.read())
        return json.dumps(parsed, indent=2, ensure_ascii=False)
    except FileNotFoundError:
        logger.error(f"JSON文件不存在: {filename}")
        raise
    except json.JSONDecodeError as exc:
        logger.error(f"JSON解析失败: {filename}, 错误: {exc}")
        raise
|
||||
|
||||
|
||||
# 导入lxml库的etree模块用于XML处理
|
||||
from lxml import etree
|
||||
|
||||
|
||||
# 定义函数,从XML文件提取所有文本内容
|
||||
def extract_xml_text(file_path: str) -> str:
    """Return all text nodes of an XML file joined by single spaces.

    The file is handed to the parser as raw bytes, so the encoding stated
    in the XML declaration is honoured. The previous version decoded the
    file as UTF-8 and re-encoded it, which failed for documents in any
    other encoding.

    Args:
        file_path: Path of the XML file.

    Returns:
        The strings yielded by ``root.itertext()``, joined with ``" "``.

    Raises:
        FileNotFoundError: Logged and re-raised.
        etree.XMLSyntaxError: Logged and re-raised when parsing fails.
        Exception: Any other failure is logged and re-raised unchanged.
    """
    try:
        # Binary mode: let the XML parser do the decoding.
        with open(file_path, "rb") as f:
            raw = f.read()
        root = etree.fromstring(raw)
        return " ".join(root.itertext())
    except FileNotFoundError:
        logger.error(f"XML文件不存在: {file_path}")
        raise
    except etree.XMLSyntaxError as e:
        logger.error(f"XML解析失败: {file_path}, 错误: {str(e)}")
        raise
    except Exception as e:
        logger.error(f"提取XML文本失败: {file_path}, 错误: {str(e)}")
        raise
|
||||
|
||||
|
||||
# 导入csv模块
|
||||
import csv
|
||||
|
||||
|
||||
# 定义读取CSV内容并串成字符串的函数
|
||||
def read_csv_to_text(filename: str) -> str:
    """Read a CSV file and return its rows as plain text.

    The fields of each row are joined with ``", "`` and the rows are joined
    with newlines.

    Args:
        filename: Path of the CSV file, read as UTF-8.

    Returns:
        The joined text; an empty string for an empty file.

    Raises:
        FileNotFoundError: Logged and re-raised.
        Exception: Any other failure is logged and re-raised unchanged.
    """
    try:
        # newline="" is what the csv module requires: without it, line
        # breaks embedded in quoted fields are rewritten by the text layer.
        with open(filename, "r", encoding="utf-8", newline="") as f:
            rows = [", ".join(row) for row in csv.reader(f)]
        return "\n".join(rows)
    except FileNotFoundError:
        logger.error(f"CSV文件不存在: {filename}")
        raise
    except Exception as e:
        logger.error(f"读取CSV文件失败: {filename}, 错误: {str(e)}")
        raise
|
||||
|
||||
|
||||
# 定义读取文本文件内容的函数
|
||||
def read_text_file(filename: str) -> str:
    """Return the full content of a UTF-8 text file.

    Args:
        filename: Path of the file to read.

    Returns:
        The file content as a string.

    Raises:
        FileNotFoundError: Logged and re-raised.
        Exception: Any other failure (for example a decoding error) is
            logged and re-raised unchanged.
    """
    try:
        with open(filename, encoding="utf-8") as handle:
            return handle.read()
    except FileNotFoundError:
        logger.error(f"文本文件不存在: {filename}")
        raise
    except Exception as exc:
        logger.error(f"读取文本文件失败: {filename}, 错误: {exc}")
        raise
|
||||
|
||||
|
||||
# 定义读取Markdown文件内容的函数
|
||||
def read_markdown_file(file_path: str) -> str:
    """Return the raw content of a UTF-8 Markdown file.

    The Markdown is not rendered or stripped; the text is returned as
    stored.

    Args:
        file_path: Path of the Markdown file.

    Returns:
        The file content as a string.

    Raises:
        FileNotFoundError: Logged and re-raised.
        Exception: Any other failure is logged and re-raised unchanged.
    """
    try:
        with open(file_path, encoding="utf-8") as handle:
            content = handle.read()
        return content
    except FileNotFoundError:
        logger.error(f"Markdown文件不存在: {file_path}")
        raise
    except Exception as exc:
        logger.error(f"读取Markdown文件失败: {file_path}, 错误: {exc}")
        raise
|
||||
@@ -0,0 +1,57 @@
|
||||
import os
|
||||
import logging
|
||||
|
||||
import extract
|
||||
|
||||
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def extractTextAuto(file_path: str) -> str:
    """Extract text from a file, choosing the extractor by file extension.

    The extension is compared case-insensitively. An info line is logged
    before the matching function of the ``extract`` module is called with
    ``file_path``.

    NOTE(review): ``.doc``, ``.xls`` and ``.ppt`` are routed to the same
    extractors as ``.docx``, ``.xlsx`` and ``.pptx``; confirm the
    underlying libraries can open those legacy formats.

    Args:
        file_path: Path of the file to read.

    Returns:
        The text returned by the selected extractor.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If the extension is not in the routing table (also
            logged as an error).
        Exception: Whatever the selected extractor raises propagates
            unchanged.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"文件不存在:{file_path}")

    # Lower-cased extension including the leading dot ("" when absent).
    _, suffix = os.path.splitext(file_path)
    ext = suffix.lower()

    # extension -> (log message prefix, name of the function in `extract`)
    pdf_route = ("检测到PDF文件,开始提取文本: ", "extract_pdf_text")
    word_route = ("检测到Word文件,开始提取文本: ", "extract_text_from_word")
    excel_route = ("检测到Excel文件,开始提取文本: ", "extract_text_from_excel")
    ppt_route = ("检测到PPT文件,开始提取文本: ", "extract_ppt_text")
    html_route = ("检测到HTML文件,开始提取文本: ", "extract_text_from_html")
    xml_route = ("检测到XML文件,开始提取文本: ", "extract_xml_text")
    csv_route = ("检测到CSV文件,开始提取文本: ", "read_csv_to_text")
    json_route = ("检测到JSON文件,开始提取文本: ", "extract_text_from_json")
    plain_route = ("检测到文本/Markdown/JSONL文件,开始读取: ", "read_text_file")
    routes = {
        ".pdf": pdf_route,
        ".docx": word_route,
        ".doc": word_route,
        ".xlsx": excel_route,
        ".xls": excel_route,
        ".pptx": ppt_route,
        ".ppt": ppt_route,
        ".html": html_route,
        ".htm": html_route,
        ".xml": xml_route,
        ".csv": csv_route,
        ".json": json_route,
        ".md": plain_route,
        ".txt": plain_route,
        ".jsonl": plain_route,
    }

    route = routes.get(ext)
    if route is None:
        logger.error(f"不支持的文件类型: {ext}")
        raise ValueError(f"不支持的文件类型: {ext}")

    announcement, extractor_name = route
    logger.info(f"{announcement}{file_path}")
    # Resolved only after logging, matching the original call order.
    return getattr(extract, extractor_name)(file_path)
|
||||
+55
@@ -0,0 +1,55 @@
|
||||
import os
|
||||
|
||||
# Install SDK: pip install 'volcengine-python-sdk[ark]'
|
||||
# from volcenginesdkarkruntime import Ark
|
||||
|
||||
# client = Ark(
|
||||
# # The base URL for model invocation
|
||||
# base_url="https://ark.cn-beijing.volces.com/api/v3/chat/completions",
|
||||
# api_key=os.getenv("ARK_API_KEY", "79b39c58-56db-4d8a-a8f8-84b95fca08db"),
|
||||
# )
|
||||
|
||||
# completion = client.chat.completions.create(
|
||||
# # Replace with Model ID
|
||||
# model="doubao-seed-1-6-lite-251015",
|
||||
# messages=[
|
||||
# {
|
||||
# "role": "system",
|
||||
# "content": "请将下面内容进行结构化处理:火山方舟是火山引擎推出的大模型服务平台,提供模型训练、推理、评测、精调等全方位功能与服务,并重点支撑大模型生态。 火山方舟通过稳定可靠的安全互信方案,保障模型提供方的模型安全与模型使用者的信息安全,加速大模型能力渗透到千行百业,助力模型提供方和使用者实现商业新增长。",
|
||||
# },
|
||||
# ],
|
||||
# )
|
||||
|
||||
# print(completion.choices[0].message.content)
|
||||
|
||||
# 使用豆包来向量化文本
|
||||
|
||||
import requests
|
||||
|
||||
# Despite the name, this URL is the chat-completions endpoint (see its
# path); the name is kept because get_doubao_llm refers to it.
VOLC_EMBEDDINGS_API_URL = "https://ark.cn-beijing.volces.com/api/v3/chat/completions"
# SECURITY: the API key is read from the ARK_API_KEY environment variable.
# The literal fallback only keeps existing setups working; it is a secret
# committed to source control and should be rotated and removed.
VOLC_API_KEY = os.getenv("ARK_API_KEY", "79b39c58-56db-4d8a-a8f8-84b95fca08db")
|
||||
|
||||
|
||||
def get_doubao_llm(prompt, timeout=60):
    """Send a prompt to the Doubao chat-completions API and return the reply.

    The prompt is sent as a single message with role ``"system"`` to model
    ``doubao-seed-1-6-lite-251015``.

    Args:
        prompt: Text to send; it is converted to a string.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout ``requests`` can block indefinitely.

    Returns:
        The ``content`` of the first choice's message in the response.

    Raises:
        RuntimeError: If the API answers with a status other than 200. The
            message includes the status code and the response body.
        requests.RequestException: On connection errors or timeout.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {VOLC_API_KEY}",
    }
    payload = {
        "model": "doubao-seed-1-6-lite-251015",
        "messages": [
            {"role": "system", "content": f"{prompt}"},
        ],
    }
    response = requests.post(
        VOLC_EMBEDDINGS_API_URL, json=payload, headers=headers, timeout=timeout
    )
    if response.status_code != 200:
        # This is a chat call, not an embedding call; say so in the error.
        raise RuntimeError(
            f"LLM API error ({response.status_code}): {response.text}"
        )
    data = response.json()
    return data["choices"][0]["message"]["content"]
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test. Guarded so that importing this module (for
    # example `from llm import get_doubao_llm`) does not call the API.
    answer = get_doubao_llm("红楼梦的作者是谁")
    print(answer)
|
||||
+118
@@ -0,0 +1,118 @@
|
||||
import os
|
||||
from typing import Optional, List
|
||||
import logging
|
||||
|
||||
import chromadb
|
||||
from sentence_transformers import SentenceTransformer
|
||||
|
||||
from llm import get_doubao_llm
|
||||
|
||||
# Configure logging at INFO level with timestamp, level and message.
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s"
)
logger = logging.getLogger(__name__)

# Name of the collection used when the caller does not pass one.
DEFAULT_COLLECTION_NAME = "rag_system"
# Default number of results requested per query.
DEFAULT_N_RESULTS = 2
# Name of the default embedding model.
DEFAULT_MODEL_NAME = "all-MiniLM-L6-v2"
# Module-level cache for the embedding model; filled by _get_model.
_mode: Optional[SentenceTransformer] = None
# Module-level cache for the chromadb client; filled by _get_client.
_client: Optional[chromadb.PersistentClient] = None
# Module-level cache for the collection; filled by _get_collection.
_collection: Optional[chromadb.Collection] = None

# Default path where the database is stored.
DEFAULT_DB_PATH = "./chroma_db"
|
||||
|
||||
|
||||
def _get_model():
    """Return the cached SentenceTransformer, creating it on first call.

    The instance is built from ``DEFAULT_MODEL_NAME`` and stored in the
    module-level ``_mode`` variable.
    """
    global _mode
    if _mode is not None:
        return _mode
    _mode = SentenceTransformer(DEFAULT_MODEL_NAME)
    return _mode
|
||||
|
||||
|
||||
def _get_client():
    """Return the cached chromadb client, creating it on first call.

    The client is a ``chromadb.PersistentClient`` opened at
    ``DEFAULT_DB_PATH`` and stored in the module-level ``_client``.
    """
    global _client
    if _client is not None:
        return _client
    _client = chromadb.PersistentClient(path=DEFAULT_DB_PATH)
    return _client
|
||||
|
||||
|
||||
def get_query_embedding(query: str) -> List[float]:
    """Encode a query string with the shared model and return it as a list.

    Args:
        query: The text to embed.

    Returns:
        The first (and only) row of ``model.encode([query])``, converted
        with ``tolist()``.
    """
    encoded_batch = _get_model().encode([query])
    return encoded_batch[0].tolist()
|
||||
|
||||
|
||||
def _get_collection(collection_name: str = DEFAULT_COLLECTION_NAME):
    """Return the collection called ``collection_name``, creating it if needed.

    The most recently used collection is cached in the module-level
    ``_collection``. The cache is reused only when its name matches the
    requested one; previously the first collection ever fetched was
    returned for every later call, whatever name was requested.

    Args:
        collection_name: Name of the collection to fetch or create.

    Returns:
        The collection object from ``get_or_create_collection``.
    """
    global _collection
    if _collection is None or _collection.name != collection_name:
        _collection = _get_client().get_or_create_collection(collection_name)
    return _collection
|
||||
|
||||
|
||||
def retrieve_relate_chunks(
    query_embedding: List[float],
    n_results: int = DEFAULT_N_RESULTS,
    collection_name: str = DEFAULT_COLLECTION_NAME,
):
    """Query the collection with one embedding and return the matched documents.

    Args:
        query_embedding: The embedding to search with.
        n_results: Number of results to request from the collection.
        collection_name: Name of the collection to search.

    Returns:
        The first entry of the ``"documents"`` field of the query result.

    Raises:
        ValueError: If the result has no documents, or the first entry is
            empty. Like every other failure here it is logged before being
            re-raised.
        Exception: Any other error from the lookup, logged and re-raised.
    """
    try:
        target = _get_collection(collection_name)
        hits = target.query(
            query_embeddings=[query_embedding], n_results=n_results
        )
        documents = hits.get("documents")
        # Only one embedding was sent, so only the first entry is used.
        first_query_docs = documents[0] if documents else None
        if not first_query_docs:
            raise ValueError("未找到相关内容")
        return first_query_docs
    except Exception as exc:
        logger.error(f"向量检索失败:{exc}")
        raise
|
||||
|
||||
|
||||
def query_rag(
    query: str,
    n_results: int = DEFAULT_N_RESULTS,
    collection_name: str = DEFAULT_COLLECTION_NAME,
):
    """Answer a question from retrieved chunks.

    Steps: embed the question, retrieve related chunks, join them into a
    prompt together with the question, print the prompt, and return the
    LLM's answer.

    Args:
        query: The user's question.
        n_results: Number of chunks to retrieve.
        collection_name: Name of the collection to search.

    Returns:
        Whatever ``get_doubao_llm`` returns for the prompt.
    """
    # 1. Turn the question into a vector.
    question_vector = get_query_embedding(query)
    # 2. Retrieve the chunks related to that vector.
    chunks = retrieve_relate_chunks(
        question_vector, n_results, collection_name=collection_name
    )
    # 3. Build the prompt from the retrieved text and the question.
    content = "\n".join(chunks)
    prompt = f"""
已知信息:{content}
请根据上述内容回答用户问题:{query}
"""
    print(prompt)
    # 4. Ask the model.
    return get_doubao_llm(prompt)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Demo query. Guarded so that importing this module does not run a
    # retrieval and an LLM call as a side effect.
    query = "西游记是谁写的"
    try:
        answer = query_rag(query, n_results=1)
        print("答案:", answer)
    except Exception as e:
        # ValueError (nothing found) and every other failure were handled
        # by two identical branches; one handler is enough.
        print(f"错误{e}")
|
||||
+65
@@ -0,0 +1,65 @@
|
||||
from typing import Optional
|
||||
import logging
|
||||
import os
|
||||
|
||||
|
||||
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
||||
from vectorstore import save_text_to_db
|
||||
from extract_text_auto import extractTextAuto
|
||||
|
||||
# Log format: timestamp, level and message, at INFO level.
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s"
)
logger = logging.getLogger(__name__)

# Default chromadb collection name to save into.
DEFAULT_COLLECTION_NAME = "rag_system"
# Default chunk size.
DEFAULT_CHUNK_SIZE = 200
# Default overlap between consecutive chunks.
DEFAULT_CHUNK_OVERLAP = 30
|
||||
|
||||
|
||||
def doc_to_vectorstore(
    file_path: str,
    collection_name: str = DEFAULT_COLLECTION_NAME,
    chunk_size: int = DEFAULT_CHUNK_SIZE,
    chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
) -> int:
    """Extract a file's text, split it into chunks and store each chunk.

    A chunk that fails to save is logged and skipped, so one bad chunk
    does not abort the whole file.

    Args:
        file_path: Path of the file to ingest.
        collection_name: Name of the collection to save into.
        chunk_size: Chunk size passed to the text splitter.
        chunk_overlap: Chunk overlap passed to the text splitter.

    Returns:
        The number of chunks saved without error; 0 when the extracted
        text is blank. Previously the success path returned None despite
        the ``-> int`` annotation.

    Raises:
        Exception: Whatever ``extractTextAuto`` raises for the file.
    """
    # 1. Load the file content.
    text = extractTextAuto(file_path)
    if not text.strip():
        logger.warning(f"文件内容为空:{file_path}")
        return 0

    # 2. Split into chunks.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    chunks = text_splitter.split_text(text)
    logger.info(f"文件分块完成,共分为{len(chunks)}块")

    # 3. Save every chunk; count the ones that succeed.
    success_count = 0
    for idx, chunk in enumerate(chunks, start=1):
        try:
            save_text_to_db(chunk, collection_name=collection_name)
        except Exception as e:
            logger.error(f"保存第{idx}块失败:{str(e)}")
        else:
            success_count += 1
    logger.info(
        f"文件{file_path}已经完成向量化并入库,成功保存{success_count}/{len(chunks)}"
    )
    return success_count
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Demo ingestion. Guarded so that importing this module does not write
    # to the vector database as a side effect.
    doc_to_vectorstore("西游记.txt")
|
||||
@@ -0,0 +1,69 @@
|
||||
import chromadb
|
||||
from typing import Optional
|
||||
import logging
|
||||
import os
|
||||
import hashlib
|
||||
|
||||
from sentence_transformers import SentenceTransformer
|
||||
|
||||
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Default collection name.
DEFAULT_COLLECTION_NAME = "rag"
# Default embedding model name.
DEFAULT_MODEL_NAME = "all-MiniLM-L6-v2"
# Default path where the database is stored.
DEFAULT_DB_PATH = "./chroma_db"

# Module-level cache for the embedding model; filled by _get_mode.
_model: Optional[SentenceTransformer] = None
# Module-level cache for the chromadb client; filled by _get_client.
_client: Optional[chromadb.PersistentClient] = None
|
||||
|
||||
|
||||
def _get_mode():
    """Return the cached SentenceTransformer, creating it on first call.

    The instance is built from ``DEFAULT_MODEL_NAME`` and stored in the
    module-level ``_model`` variable.
    """
    global _model
    if _model is not None:
        return _model
    _model = SentenceTransformer(DEFAULT_MODEL_NAME)
    return _model
|
||||
|
||||
|
||||
def _get_client():
    """Return the cached chromadb client, creating it on first call.

    The client is a ``chromadb.PersistentClient`` opened at
    ``DEFAULT_DB_PATH`` and stored in the module-level ``_client``.
    """
    global _client
    if _client is not None:
        return _client
    _client = chromadb.PersistentClient(path=DEFAULT_DB_PATH)
    return _client
|
||||
|
||||
|
||||
def save_text_to_db(text: str, collection_name=DEFAULT_COLLECTION_NAME):
    """Embed a piece of text and store it in a collection.

    The record id is the MD5 hex digest of the UTF-8 encoded text. If a
    record with that id is already in the collection, nothing is added.

    Args:
        text: The text to store.
        collection_name: Name of the collection, created if missing.

    Returns:
        The record id; an empty string when ``text`` is empty or blank.

    Raises:
        Exception: Any failure is logged and re-raised unchanged.
    """
    try:
        if not (text and text.strip()):
            logger.warning("空文本,已跳过")
            return ""

        encoder = _get_mode()
        store = _get_client().get_or_create_collection(collection_name)

        # The id is derived from the content itself.
        text_id = hashlib.md5(text.encode("utf-8")).hexdigest()
        already_stored = store.get(ids=[text_id])
        if already_stored and already_stored.get("ids"):
            logger.info(f"此文本已保存过,跳过保存,id={text_id}")
            return text_id

        # encode() is given a one-item batch; take that row as a list.
        vector = encoder.encode([text])[0].tolist()
        store.add(
            documents=[text],
            embeddings=[vector],
            ids=[text_id],
            metadatas=[{"source": "document"}],
        )
        return text_id
    except Exception as exc:
        logger.error(f"保存文本向量库失败{exc}")
        raise
|
||||
@@ -0,0 +1 @@
|
||||
西游记作者吴承恩
|
||||
Reference in New Issue
Block a user