diff --git a/crazy_functional.py b/crazy_functional.py
index 92bc2842cd..248262cc9b 100644
--- a/crazy_functional.py
+++ b/crazy_functional.py
@@ -21,13 +21,13 @@ def get_crazy_functions():
     from crazy_functions.询问多个大语言模型 import 同时问询
     from crazy_functions.SourceCode_Analyse import 解析一个Lua项目
     from crazy_functions.SourceCode_Analyse import 解析一个CSharp项目
-    from crazy_functions.总结word文档 import 总结word文档
     from crazy_functions.解析JupyterNotebook import 解析ipynb文件
     from crazy_functions.Conversation_To_File import 载入对话历史存档
     from crazy_functions.Conversation_To_File import 对话历史存档
     from crazy_functions.Conversation_To_File import Conversation_To_File_Wrap
     from crazy_functions.Conversation_To_File import 删除所有本地对话历史记录
     from crazy_functions.辅助功能 import 清除缓存
+    from crazy_functions.批量文件询问 import 批量文件询问
     from crazy_functions.Markdown_Translate import Markdown英译中
     from crazy_functions.批量总结PDF文档 import 批量总结PDF文档
     from crazy_functions.PDF_Translate import 批量翻译PDF文档
@@ -110,12 +110,13 @@ def get_crazy_functions():
             "Function": HotReload(Latex翻译中文并重新编译PDF),  # 当注册Class后,Function旧接口仅会在“虚空终端”中起作用
             "Class": Arxiv_Localize,  # 新一代插件需要注册Class
         },
-        "批量总结Word文档": {
+        "批量文件询问": {
             "Group": "学术",
             "Color": "stop",
             "AsButton": False,
-            "Info": "批量总结word文档 | 输入参数为路径",
-            "Function": HotReload(总结word文档),
+            "AdvancedArgs": True,
+            "Info": "通过在高级参数区写入prompt,可自定义询问逻辑,默认情况下为总结逻辑 | 输入参数为路径",
+            "Function": HotReload(批量文件询问),
         },
         "解析整个Matlab项目": {
             "Group": "编程",
diff --git a/crazy_functions/doc_fns/batch_file_query_doc.py b/crazy_functions/doc_fns/batch_file_query_doc.py
new file mode 100644
index 0000000000..7fba2a8270
--- /dev/null
+++ b/crazy_functions/doc_fns/batch_file_query_doc.py
@@ -0,0 +1,450 @@
+import os
+import time
+from abc import ABC, abstractmethod
+from datetime import datetime
+from docx import Document
+from docx.enum.style import WD_STYLE_TYPE
+from docx.enum.text import WD_PARAGRAPH_ALIGNMENT, WD_LINE_SPACING
+from docx.oxml.ns import qn
+from docx.shared import Cm, Inches, Pt, RGBColor
+from typing import Dict, List, Tuple
+
+
+class DocumentFormatter(ABC):
+    """文档格式化基类,定义文档格式化的基本接口"""
+
+    def __init__(self, final_summary: str, file_summaries_map: Dict, failed_files: List[Tuple]):
+        self.final_summary = final_summary
+        self.file_summaries_map = file_summaries_map
+        self.failed_files = failed_files
+
+    @abstractmethod
+    def format_failed_files(self) -> str:
+        """格式化失败文件列表"""
+        pass
+
+    @abstractmethod
+    def format_file_summaries(self) -> str:
+        """格式化文件总结内容"""
+        pass
+
+    @abstractmethod
+    def create_document(self) -> str:
+        """创建完整文档"""
+        pass
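All three concrete formatters below (Word, Markdown, HTML) implement this same three-method contract against the same three inputs. For orientation, a minimal sketch of what a hypothetical additional backend would have to provide — `PlainTextFormatter` is illustrative only, not part of this PR:

```python
import os

class PlainTextFormatter(DocumentFormatter):
    """Hypothetical extra backend, shown only to illustrate the ABC contract."""

    def format_failed_files(self) -> str:
        # One line per failed file: "name: reason"
        return "\n".join(f"{os.path.basename(fp)}: {reason}"
                         for fp, reason in self.failed_files)

    def format_file_summaries(self) -> str:
        # One block per file, sorted by path for a stable order
        return "\n\n".join(f"[{path}]\n{summary}"
                           for path, summary in sorted(self.file_summaries_map.items()))

    def create_document(self) -> str:
        parts = ["文档总结报告", self.final_summary]
        if self.failed_files:
            parts.append(self.format_failed_files())
        parts.append(self.format_file_summaries())
        return "\n\n".join(parts)
```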
+
+
+class WordFormatter(DocumentFormatter):
+    """Word格式文档生成器 - 符合中国政府公文格式规范(GB/T 9704-2012),并进行了优化"""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.doc = Document()
+        self._setup_document()
+        self._create_styles()
+        # 初始化三级标题编号系统
+        self.numbers = {
+            1: 0,  # 一级标题编号
+            2: 0,  # 二级标题编号
+            3: 0   # 三级标题编号
+        }
+
+    def _setup_document(self):
+        """设置文档基本格式,包括页面设置和页眉"""
+        sections = self.doc.sections
+        for section in sections:
+            # 设置页面大小为A4
+            section.page_width = Cm(21)
+            section.page_height = Cm(29.7)
+            # 设置页边距
+            section.top_margin = Cm(3.7)     # 上边距37mm
+            section.bottom_margin = Cm(3.5)  # 下边距35mm
+            section.left_margin = Cm(2.8)    # 左边距28mm
+            section.right_margin = Cm(2.6)   # 右边距26mm
+            # 设置页眉页脚距离
+            section.header_distance = Cm(2.0)
+            section.footer_distance = Cm(2.0)
+
+            # 添加页眉
+            header = section.header
+            header_para = header.paragraphs[0]
+            header_para.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT
+            header_run = header_para.add_run("该文档由GPT-academic生成")
+            header_run.font.name = '仿宋'
+            header_run._element.rPr.rFonts.set(qn('w:eastAsia'), '仿宋')
+            header_run.font.size = Pt(9)
+
+    def _create_styles(self):
+        """创建文档样式"""
+        # 创建正文样式
+        style = self.doc.styles.add_style('Normal_Custom', WD_STYLE_TYPE.PARAGRAPH)
+        style.font.name = '仿宋'
+        style._element.rPr.rFonts.set(qn('w:eastAsia'), '仿宋')
+        style.font.size = Pt(14)
+        style.paragraph_format.line_spacing_rule = WD_LINE_SPACING.ONE_POINT_FIVE
+        style.paragraph_format.space_after = Pt(0)
+        style.paragraph_format.first_line_indent = Pt(28)
+
+        # 创建各级标题样式
+        self._create_heading_style('Title_Custom', '方正小标宋简体', 32, WD_PARAGRAPH_ALIGNMENT.CENTER)
+        self._create_heading_style('Heading1_Custom', '黑体', 22, WD_PARAGRAPH_ALIGNMENT.LEFT)
+        self._create_heading_style('Heading2_Custom', '黑体', 18, WD_PARAGRAPH_ALIGNMENT.LEFT)
+        self._create_heading_style('Heading3_Custom', '黑体', 16, WD_PARAGRAPH_ALIGNMENT.LEFT)
+
+    def _create_heading_style(self, style_name: str, font_name: str, font_size: int, alignment):
+        """创建标题样式"""
+        style = self.doc.styles.add_style(style_name, WD_STYLE_TYPE.PARAGRAPH)
+        style.font.name = font_name
+        style._element.rPr.rFonts.set(qn('w:eastAsia'), font_name)
+        style.font.size = Pt(font_size)
+        style.font.bold = True
+        style.paragraph_format.alignment = alignment
+        style.paragraph_format.space_before = Pt(12)
+        style.paragraph_format.space_after = Pt(12)
+        style.paragraph_format.line_spacing_rule = WD_LINE_SPACING.ONE_POINT_FIVE
+        return style
+
+    def _get_heading_number(self, level: int) -> str:
+        """
+        生成标题编号
+
+        Args:
+            level: 标题级别 (0-3)
+
+        Returns:
+            str: 格式化的标题编号
+        """
+        if level == 0:  # 主标题不需要编号
+            return ""
+
+        self.numbers[level] += 1  # 增加当前级别的编号
+
+        # 重置下级标题编号
+        for i in range(level + 1, 4):
+            self.numbers[i] = 0
+
+        # 根据级别返回不同格式的编号
+        if level == 1:
+            return f"{self.numbers[1]}. "
+        elif level == 2:
+            return f"{self.numbers[1]}.{self.numbers[2]} "
+        elif level == 3:
+            return f"{self.numbers[1]}.{self.numbers[2]}.{self.numbers[3]} "
+        return ""
+
+    def _add_heading(self, text: str, level: int):
+        """
+        添加带编号的标题
+
+        Args:
+            text: 标题文本
+            level: 标题级别 (0-3)
+        """
+        style_map = {
+            0: 'Title_Custom',
+            1: 'Heading1_Custom',
+            2: 'Heading2_Custom',
+            3: 'Heading3_Custom'
+        }
+
+        number = self._get_heading_number(level)
+        paragraph = self.doc.add_paragraph(style=style_map[level])
+
+        if number:
+            number_run = paragraph.add_run(number)
+            font_size = 22 if level == 1 else (18 if level == 2 else 16)
+            self._get_run_style(number_run, '黑体', font_size, True)
+
+        text_run = paragraph.add_run(text)
+        font_size = 32 if level == 0 else (22 if level == 1 else (18 if level == 2 else 16))
+        self._get_run_style(text_run, '黑体', font_size, True)
+
+        # 主标题添加日期
+        if level == 0:
+            date_paragraph = self.doc.add_paragraph()
+            date_paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
+            date_run = date_paragraph.add_run(datetime.now().strftime('%Y年%m月%d日'))
+            self._get_run_style(date_run, '仿宋', 16, False)
+
+        return paragraph
+
+    def _get_run_style(self, run, font_name: str, font_size: int, bold: bool = False):
+        """设置文本运行对象的样式"""
+        run.font.name = font_name
+        run._element.rPr.rFonts.set(qn('w:eastAsia'), font_name)
+        run.font.size = Pt(font_size)
+        run.font.bold = bold
+
+    def format_failed_files(self) -> str:
+        """格式化失败文件列表"""
+        result = []
+        if not self.failed_files:
+            return "\n".join(result)
+
+        result.append("处理失败文件:")
+        for fp, reason in self.failed_files:
+            result.append(f"• {os.path.basename(fp)}: {reason}")
+
+        self._add_heading("处理失败文件", 1)
+        for fp, reason in self.failed_files:
+            self._add_content(f"• {os.path.basename(fp)}: {reason}", indent=False)
+        self.doc.add_paragraph()
+
+        return "\n".join(result)
+
+    def _add_content(self, text: str, indent: bool = True):
+        """添加正文内容"""
+        paragraph = self.doc.add_paragraph(text, style='Normal_Custom')
+        if not indent:
+            paragraph.paragraph_format.first_line_indent = Pt(0)
+        return paragraph
+
+    def format_file_summaries(self) -> str:
+        """
+        格式化文件总结内容,确保正确的标题层级
+
+        返回:
+            str: 格式化后的文件总结字符串
+
+        标题层级规则:
+        1. 一级标题为"各文件详细总结"
+        2. 如果文件有目录路径:
+           - 目录路径作为二级标题 (2.1, 2.2 等)
+           - 该目录下所有文件作为三级标题 (2.1.1, 2.1.2 等)
+        3. 如果文件没有目录路径:
+           - 文件直接作为二级标题 (2.1, 2.2 等)
+        """
+        result = []
+        # 首先对文件路径进行分组整理
+        file_groups = {}
+        for path in sorted(self.file_summaries_map.keys()):
+            dir_path = os.path.dirname(path)
+            if dir_path not in file_groups:
+                file_groups[dir_path] = []
+            file_groups[dir_path].append(path)
+
+        # 处理没有目录的文件
+        root_files = file_groups.get("", [])
+        if root_files:
+            for path in sorted(root_files):
+                file_name = os.path.basename(path)
+                result.append(f"\n📄 {file_name}")
+                result.append(self.file_summaries_map[path])
+                # 无目录的文件作为二级标题
+                self._add_heading(f"📄 {file_name}", 2)
+                self._add_content(self.file_summaries_map[path])
+                self.doc.add_paragraph()
+
+        # 处理有目录的文件
+        for dir_path in sorted(file_groups.keys()):
+            if dir_path == "":  # 跳过已处理的根目录文件
+                continue
+
+            # 添加目录作为二级标题
+            result.append(f"\n📁 {dir_path}")
+            self._add_heading(f"📁 {dir_path}", 2)
+
+            # 该目录下的所有文件作为三级标题
+            for path in sorted(file_groups[dir_path]):
+                file_name = os.path.basename(path)
+                result.append(f"\n📄 {file_name}")
+                result.append(self.file_summaries_map[path])
+
+                # 添加文件名作为三级标题
+                self._add_heading(f"📄 {file_name}", 3)
+                self._add_content(self.file_summaries_map[path])
+                self.doc.add_paragraph()
+
+        return "\n".join(result)
+
+    def create_document(self):
+        """创建完整Word文档并返回文档对象"""
+        # 重置所有编号
+        for level in self.numbers:
+            self.numbers[level] = 0
+
+        # 添加主标题
+        self._add_heading("文档总结报告", 0)
+        self.doc.add_paragraph()
+
+        # 添加总体摘要
+        self._add_heading("总体摘要", 1)
+        self._add_content(self.final_summary)
+        self.doc.add_paragraph()
+
+        # 添加失败文件列表(如果有)
+        if self.failed_files:
+            self.format_failed_files()
+
+        # 添加文件详细总结
+        self._add_heading("各文件详细总结", 1)
+        self.format_file_summaries()
+
+        return self.doc
+
+
+class MarkdownFormatter(DocumentFormatter):
+    """Markdown格式文档生成器"""
+
+    def format_failed_files(self) -> str:
+        if not self.failed_files:
+            return ""
+
+        formatted_text = ["\n## ⚠️ 处理失败的文件"]
+        for fp, reason in self.failed_files:
+            formatted_text.append(f"- {os.path.basename(fp)}: {reason}")
+        formatted_text.append("\n---")
+        return "\n".join(formatted_text)
+
+    def format_file_summaries(self) -> str:
+        formatted_text = []
+        sorted_paths = sorted(self.file_summaries_map.keys())
+        current_dir = ""
+
+        for path in sorted_paths:
+            dir_path = os.path.dirname(path)
+            if dir_path != current_dir:
+                if dir_path:
+                    formatted_text.append(f"\n## 📁 {dir_path}")
+                current_dir = dir_path
+
+            file_name = os.path.basename(path)
+            formatted_text.append(f"\n### 📄 {file_name}")
+            formatted_text.append(self.file_summaries_map[path])
+            formatted_text.append("\n---")
+
+        return "\n".join(formatted_text)
+
+    def create_document(self) -> str:
+        document = [
+            "# 📑 文档总结报告",
+            "\n## 总体摘要",
+            self.final_summary
+        ]
+
+        if self.failed_files:
+            document.append(self.format_failed_files())
+
+        document.extend([
+            "\n# 📚 各文件详细总结",
+            self.format_file_summaries()
+        ])
+
+        return "\n".join(document)
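All the formatters are driven identically, which is how `save_results` later in this PR uses them. A minimal usage sketch with made-up sample data:

```python
final_summary = "两份文档均为项目说明文档。"
file_summaries_map = {
    "README.md": "项目简介……",
    "docs/setup.md": "安装步骤……",
}
failed_files = [("old.doc", "请先将.doc文档转换为.docx文档。")]

md = MarkdownFormatter(final_summary, file_summaries_map, failed_files)
with open("文档总结.md", "w", encoding="utf-8") as f:
    f.write(md.create_document())   # Markdown/HTML formatters return a string

doc = WordFormatter(final_summary, file_summaries_map, failed_files).create_document()
doc.save("文档总结.docx")            # WordFormatter returns a python-docx Document instead
```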
+
+
+class HtmlFormatter(DocumentFormatter):
+    """HTML格式文档生成器"""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.css_styles = """
+        body {
+            font-family: "Microsoft YaHei", Arial, sans-serif;
+            line-height: 1.6;
+            max-width: 1000px;
+            margin: 0 auto;
+            padding: 20px;
+            color: #333;
+        }
+        h1 {
+            color: #2c3e50;
+            border-bottom: 2px solid #eee;
+            padding-bottom: 10px;
+            font-size: 24px;
+            text-align: center;
+        }
+        h2 {
+            color: #34495e;
+            margin-top: 30px;
+            font-size: 20px;
+            border-left: 4px solid #3498db;
+            padding-left: 10px;
+        }
+        h3 {
+            color: #2c3e50;
+            font-size: 18px;
+            margin-top: 20px;
+        }
+        .summary {
+            background-color: #f8f9fa;
+            padding: 20px;
+            border-radius: 5px;
+            margin: 20px 0;
+            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
+        }
+        .details {
+            margin-top: 40px;
+        }
+        .failed-files {
+            background-color: #fff3f3;
+            padding: 15px;
+            border-left: 4px solid #e74c3c;
+            margin: 20px 0;
+        }
+        .file-summary {
+            background-color: #fff;
+            padding: 15px;
+            margin: 15px 0;
+            border-radius: 4px;
+            box-shadow: 0 1px 3px rgba(0,0,0,0.1);
+        }
+        """
+
+    def format_failed_files(self) -> str:
+        if not self.failed_files:
+            return ""
+
+        failed_files_html = ['<div class="failed-files">']
+        failed_files_html.append("<h2>⚠️ 处理失败的文件</h2><ul>")
+        for fp, reason in self.failed_files:
+            failed_files_html.append(f"<li>{os.path.basename(fp)}: {reason}</li>")
+        failed_files_html.append("</ul></div>")
+        return "\n".join(failed_files_html)
+
+    def format_file_summaries(self) -> str:
+        formatted_html = []
+        sorted_paths = sorted(self.file_summaries_map.keys())
+        current_dir = ""
+
+        for path in sorted_paths:
+            dir_path = os.path.dirname(path)
+            if dir_path != current_dir:
+                if dir_path:
+                    formatted_html.append(f'<h2>📁 {dir_path}</h2>')
+                current_dir = dir_path
+
+            file_name = os.path.basename(path)
+            formatted_html.append('<div class="file-summary">')
+            formatted_html.append(f'<h3>📄 {file_name}</h3>')
+            formatted_html.append(f'<p>{self.file_summaries_map[path]}</p>')
+            formatted_html.append('</div>')
+
+        return "\n".join(formatted_html)
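The markup in these two methods had its HTML tags stripped by extraction; the tags above are reconstructed from the CSS class names defined in `__init__` and from the parallel MarkdownFormatter. One caveat worth noting at this seam: the per-file summaries and the final report are model output and are interpolated into the HTML unescaped. If that ever matters, a small helper along these lines would harden it — a sketch, not something this PR does:

```python
import html

def safe(text: str) -> str:
    """Escape model-generated text before embedding it in the report HTML."""
    return html.escape(text, quote=True)

# e.g. inside format_file_summaries:
#   formatted_html.append(f'<p>{safe(self.file_summaries_map[path])}</p>')
```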
+
+    def create_document(self) -> str:
+        return f"""
+<!DOCTYPE html>
+<html>
+<head>
+    <meta charset="UTF-8">
+    <title>文档总结报告</title>
+    <style>{self.css_styles}</style>
+</head>
+<body>
+    <h1>📑 文档总结报告</h1>
+    <div class="summary">
+        <h2>总体摘要</h2>
+        <p>{self.final_summary}</p>
+        {self.format_failed_files()}
+    </div>
+    <div class="details">
+        <h2>📚 各文件详细总结</h2>
+        {self.format_file_summaries()}
+    </div>
+</body>
+</html>
+"""
diff --git a/crazy_functions/rag_fns/llama_index_worker.py b/crazy_functions/rag_fns/llama_index_worker.py
index f6f7f0ab2a..59a5827cff 100644
--- a/crazy_functions/rag_fns/llama_index_worker.py
+++ b/crazy_functions/rag_fns/llama_index_worker.py
@@ -1,17 +1,13 @@
-import llama_index
-import os
 import atexit
 from loguru import logger
 from typing import List
+
 from llama_index.core import Document
+from llama_index.core.ingestion import run_transformations
 from llama_index.core.schema import TextNode
-from request_llms.embed_models.openai_embed import OpenAiEmbeddingModel
-from shared_utils.connect_void_terminal import get_chat_default_kwargs
-from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
+
 from crazy_functions.rag_fns.vector_store_index import GptacVectorStoreIndex
-from llama_index.core.ingestion import run_transformations
-from llama_index.core import PromptTemplate
-from llama_index.core.response_synthesizers import TreeSummarize
+from request_llms.embed_models.openai_embed import OpenAiEmbeddingModel
 
 DEFAULT_QUERY_GENERATION_PROMPT = """\
 Now, you have context information as below:
@@ -63,7 +59,7 @@ def create_new_vs(self):
     def purge(self):
         import shutil
         shutil.rmtree(self.checkpoint_dir, ignore_errors=True)
-        self.vs_index = self.create_new_vs()
+        self.vs_index = self.create_new_vs(self.checkpoint_dir)
 
 
 class LlamaIndexRagWorker(SaveLoad):
@@ -75,7 +71,7 @@ def __init__(self, user_name, llm_kwargs, auto_load_checkpoint=True, checkpoint_
         if auto_load_checkpoint:
            self.vs_index = self.load_from_checkpoint(checkpoint_dir)
         else:
-            self.vs_index = self.create_new_vs(checkpoint_dir)
+            self.vs_index = self.create_new_vs()
         atexit.register(lambda: self.save_to_checkpoint(checkpoint_dir))
 
     def assign_embedding_model(self):
@@ -91,40 +87,52 @@ def inspect_vector_store(self):
         logger.info('oo --------inspect_vector_store end--------')
         return vector_store_preview
 
-    def add_documents_to_vector_store(self, document_list):
-        documents = [Document(text=t) for t in document_list]
+    def add_documents_to_vector_store(self, document_list: List[Document]):
+        """
+        Adds a list of Document objects to the vector store after processing.
+        """
+        documents = document_list
         documents_nodes = run_transformations(
-            documents,  # type: ignore
-            self.vs_index._transformations,
-            show_progress=True
-        )
+            documents,  # type: ignore
+            self.vs_index._transformations,
+            show_progress=True
+        )
         self.vs_index.insert_nodes(documents_nodes)
-        if self.debug_mode: self.inspect_vector_store()
+        if self.debug_mode:
+            self.inspect_vector_store()
 
-    def add_text_to_vector_store(self, text):
+    def add_text_to_vector_store(self, text: str):
         node = TextNode(text=text)
         documents_nodes = run_transformations(
-            [node],
-            self.vs_index._transformations,
-            show_progress=True
-        )
+            [node],
+            self.vs_index._transformations,
+            show_progress=True
+        )
         self.vs_index.insert_nodes(documents_nodes)
-        if self.debug_mode: self.inspect_vector_store()
+        if self.debug_mode:
+            self.inspect_vector_store()
 
     def remember_qa(self, question, answer):
         formatted_str = QUESTION_ANSWER_RECORD.format(question=question, answer=answer)
         self.add_text_to_vector_store(formatted_str)
 
     def retrieve_from_store_with_query(self, query):
-        if self.debug_mode: self.inspect_vector_store()
+        if self.debug_mode:
+            self.inspect_vector_store()
         retriever = self.vs_index.as_retriever()
         return retriever.retrieve(query)
 
     def build_prompt(self, query, nodes):
         context_str = self.generate_node_array_preview(nodes)
         return DEFAULT_QUERY_GENERATION_PROMPT.format(context_str=context_str, query_str=query)
-
+
     def generate_node_array_preview(self, nodes):
         buf = "\n".join(([f"(No.{i+1} | score {n.score:.3f}): {n.text}" for i, n in enumerate(nodes)]))
         if self.debug_mode: logger.info(buf)
         return buf
+
+    def purge_vector_store(self):
+        """
+        Purges the current vector store and creates a new one.
+        """
+        self.purge()
\ No newline at end of file
+ """ + documents = document_list documents_nodes = run_transformations( - documents, # type: ignore - self.vs_index._transformations, - show_progress=True - ) + documents, # type: ignore + self.vs_index._transformations, + show_progress=True + ) self.vs_index.insert_nodes(documents_nodes) - if self.debug_mode: self.inspect_vector_store() + if self.debug_mode: + self.inspect_vector_store() - def add_text_to_vector_store(self, text): + def add_text_to_vector_store(self, text: str): node = TextNode(text=text) documents_nodes = run_transformations( - [node], - self.vs_index._transformations, - show_progress=True - ) + [node], + self.vs_index._transformations, + show_progress=True + ) self.vs_index.insert_nodes(documents_nodes) - if self.debug_mode: self.inspect_vector_store() + if self.debug_mode: + self.inspect_vector_store() def remember_qa(self, question, answer): formatted_str = QUESTION_ANSWER_RECORD.format(question=question, answer=answer) self.add_text_to_vector_store(formatted_str) def retrieve_from_store_with_query(self, query): - if self.debug_mode: self.inspect_vector_store() + if self.debug_mode: + self.inspect_vector_store() retriever = self.vs_index.as_retriever() return retriever.retrieve(query) def build_prompt(self, query, nodes): context_str = self.generate_node_array_preview(nodes) return DEFAULT_QUERY_GENERATION_PROMPT.format(context_str=context_str, query_str=query) - + def generate_node_array_preview(self, nodes): buf = "\n".join(([f"(No.{i+1} | score {n.score:.3f}): {n.text}" for i, n in enumerate(nodes)])) if self.debug_mode: logger.info(buf) return buf + + def purge_vector_store(self): + """ + Purges the current vector store and creates a new one. + """ + self.purge() \ No newline at end of file diff --git a/crazy_functions/rag_fns/rag_file_support.py b/crazy_functions/rag_fns/rag_file_support.py new file mode 100644 index 0000000000..f826fab1b9 --- /dev/null +++ b/crazy_functions/rag_fns/rag_file_support.py @@ -0,0 +1,45 @@ +import os +from llama_index.core import SimpleDirectoryReader + +supports_format = ['.csv', '.docx','.doc', '.epub', '.ipynb', '.mbox', '.md', '.pdf', '.txt', '.ppt', + '.pptm', '.pptx','.py', '.xls', '.xlsx', '.html', '.json', '.xml', '.yaml', '.yml' ,'.m'] + +def read_docx_doc(file_path): + if file_path.split(".")[-1] == "docx": + from docx import Document + doc = Document(file_path) + file_content = "\n".join([para.text for para in doc.paragraphs]) + else: + try: + import win32com.client + word = win32com.client.Dispatch("Word.Application") + word.visible = False + # 打开文件 + doc = word.Documents.Open(os.getcwd() + '/' + file_path) + # file_content = doc.Content.Text + doc = word.ActiveDocument + file_content = doc.Range().Text + doc.Close() + word.Quit() + except: + raise RuntimeError('请先将.doc文档转换为.docx文档。') + return file_content + +# 修改后的 extract_text 函数,结合 SimpleDirectoryReader 和自定义解析逻辑 +import os + +def extract_text(file_path): + _, ext = os.path.splitext(file_path.lower()) + + # 使用 SimpleDirectoryReader 处理它支持的文件格式 + if ext in ['.docx', '.doc']: + return read_docx_doc(file_path) + try: + reader = SimpleDirectoryReader(input_files=[file_path]) + documents = reader.load_data() + if len(documents) > 0: + return documents[0].text + except Exception as e: + pass + + return None diff --git "a/crazy_functions/\346\200\273\347\273\223word\346\226\207\346\241\243.py" "b/crazy_functions/\346\200\273\347\273\223word\346\226\207\346\241\243.py" deleted file mode 100644 index 99f0919bf7..0000000000 --- 
"a/crazy_functions/\346\200\273\347\273\223word\346\226\207\346\241\243.py" +++ /dev/null @@ -1,127 +0,0 @@ -from toolbox import update_ui -from toolbox import CatchException, report_exception -from toolbox import write_history_to_file, promote_file_to_downloadzone -from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive -fast_debug = False - - -def 解析docx(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt): - import time, os - # pip install python-docx 用于docx格式,跨平台 - # pip install pywin32 用于doc格式,仅支持Win平台 - for index, fp in enumerate(file_manifest): - if fp.split(".")[-1] == "docx": - from docx import Document - doc = Document(fp) - file_content = "\n".join([para.text for para in doc.paragraphs]) - else: - try: - import win32com.client - word = win32com.client.Dispatch("Word.Application") - word.visible = False - # 打开文件 - doc = word.Documents.Open(os.getcwd() + '/' + fp) - # file_content = doc.Content.Text - doc = word.ActiveDocument - file_content = doc.Range().Text - doc.Close() - word.Quit() - except: - raise RuntimeError('请先将.doc文档转换为.docx文档。') - - # private_upload里面的文件名在解压zip后容易出现乱码(rar和7z格式正常),故可以只分析文章内容,不输入文件名 - from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit - from request_llms.bridge_all import model_info - max_token = model_info[llm_kwargs['llm_model']]['max_token'] - TOKEN_LIMIT_PER_FRAGMENT = max_token * 3 // 4 - paper_fragments = breakdown_text_to_satisfy_token_limit(txt=file_content, limit=TOKEN_LIMIT_PER_FRAGMENT, llm_model=llm_kwargs['llm_model']) - this_paper_history = [] - for i, paper_frag in enumerate(paper_fragments): - i_say = f'请对下面的文章片段用中文做概述,文件名是{os.path.relpath(fp, project_folder)},文章内容是 ```{paper_frag}```' - i_say_show_user = f'请对下面的文章片段做概述: {os.path.abspath(fp)}的第{i+1}/{len(paper_fragments)}个片段。' - gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive( - inputs=i_say, - inputs_show_user=i_say_show_user, - llm_kwargs=llm_kwargs, - chatbot=chatbot, - history=[], - sys_prompt="总结文章。" - ) - - chatbot[-1] = (i_say_show_user, gpt_say) - history.extend([i_say_show_user,gpt_say]) - this_paper_history.extend([i_say_show_user,gpt_say]) - - # 已经对该文章的所有片段总结完毕,如果文章被切分了, - if len(paper_fragments) > 1: - i_say = f"根据以上的对话,总结文章{os.path.abspath(fp)}的主要内容。" - gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive( - inputs=i_say, - inputs_show_user=i_say, - llm_kwargs=llm_kwargs, - chatbot=chatbot, - history=this_paper_history, - sys_prompt="总结文章。" - ) - - history.extend([i_say,gpt_say]) - this_paper_history.extend([i_say,gpt_say]) - - res = write_history_to_file(history) - promote_file_to_downloadzone(res, chatbot=chatbot) - chatbot.append(("完成了吗?", res)) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - - res = write_history_to_file(history) - promote_file_to_downloadzone(res, chatbot=chatbot) - chatbot.append(("所有文件都总结完成了吗?", res)) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - - -@CatchException -def 总结word文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request): - import glob, os - - # 基本信息:功能、贡献者 - chatbot.append([ - "函数插件功能?", - "批量总结Word文档。函数插件贡献者: JasonGuo1。注意, 如果是.doc文件, 请先转化为.docx格式。"]) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - - # 尝试导入依赖,如果缺少依赖,则给出安装建议 - try: - from docx import Document - except: - report_exception(chatbot, history, - a=f"解析项目: {txt}", - b=f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade python-docx pywin32```。") - yield from 
update_ui(chatbot=chatbot, history=history) # 刷新界面 - return - - # 清空历史,以免输入溢出 - history = [] - - # 检测输入参数,如没有给定输入参数,直接退出 - if os.path.exists(txt): - project_folder = txt - else: - if txt == "": txt = '空空如也的输入栏' - report_exception(chatbot, history, a=f"解析项目: {txt}", b=f"找不到本地项目或无权访问: {txt}") - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - return - - # 搜索需要处理的文件清单 - if txt.endswith('.docx') or txt.endswith('.doc'): - file_manifest = [txt] - else: - file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.docx', recursive=True)] + \ - [f for f in glob.glob(f'{project_folder}/**/*.doc', recursive=True)] - - # 如果没找到任何文件 - if len(file_manifest) == 0: - report_exception(chatbot, history, a=f"解析项目: {txt}", b=f"找不到任何.docx或doc文件: {txt}") - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - return - - # 开始正式执行任务 - yield from 解析docx(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt) diff --git "a/crazy_functions/\346\211\271\351\207\217\346\226\207\344\273\266\350\257\242\351\227\256.py" "b/crazy_functions/\346\211\271\351\207\217\346\226\207\344\273\266\350\257\242\351\227\256.py" new file mode 100644 index 0000000000..b95c9fcb35 --- /dev/null +++ "b/crazy_functions/\346\211\271\351\207\217\346\226\207\344\273\266\350\257\242\351\227\256.py" @@ -0,0 +1,496 @@ +import os +import threading +import time +from dataclasses import dataclass +from typing import List, Tuple, Dict, Generator + +from crazy_functions.crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency +from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit +from crazy_functions.rag_fns.rag_file_support import extract_text +from request_llms.bridge_all import model_info +from toolbox import update_ui, CatchException, report_exception + + +@dataclass +class FileFragment: + """文件片段数据类,用于组织处理单元""" + file_path: str + content: str + rel_path: str + fragment_index: int + total_fragments: int + + +class BatchDocumentSummarizer: + """优化的文档总结器 - 批处理版本""" + + def __init__(self, llm_kwargs: Dict, plugin_kwargs: Dict, chatbot: List, history: List, system_prompt: str): + """初始化总结器""" + self.llm_kwargs = llm_kwargs + self.plugin_kwargs = plugin_kwargs + self.chatbot = chatbot + self.history = history + self.system_prompt = system_prompt + self.failed_files = [] + self.file_summaries_map = {} + + def _get_token_limit(self) -> int: + """获取模型token限制""" + max_token = model_info[self.llm_kwargs['llm_model']]['max_token'] + return max_token * 3 // 4 + + def _create_batch_inputs(self, fragments: List[FileFragment]) -> Tuple[List, List, List]: + """创建批处理输入""" + inputs_array = [] + inputs_show_user_array = [] + history_array = [] + + for frag in fragments: + if self.plugin_kwargs.get("advanced_arg"): + i_say = (f'请按照用户要求对文件内容进行处理,文件名为{os.path.basename(frag.file_path)},' + f'用户要求为:{self.plugin_kwargs["advanced_arg"]}:' + f'文件内容是 ```{frag.content}```') + i_say_show_user = (f'正在处理 {frag.rel_path} (片段 {frag.fragment_index + 1}/{frag.total_fragments})') + else: + i_say = (f'请对下面的内容用中文做总结,不超过500字,文件名是{os.path.basename(frag.file_path)},' + f'内容是 ```{frag.content}```') + i_say_show_user = f'正在处理 {frag.rel_path} (片段 {frag.fragment_index + 1}/{frag.total_fragments})' + + inputs_array.append(i_say) + inputs_show_user_array.append(i_say_show_user) + history_array.append([]) + + return inputs_array, inputs_show_user_array, history_array + + def _process_single_file_with_timeout(self, file_info: Tuple[str, str], mutable_status: List) 
diff --git "a/crazy_functions/\346\211\271\351\207\217\346\226\207\344\273\266\350\257\242\351\227\256.py" "b/crazy_functions/\346\211\271\351\207\217\346\226\207\344\273\266\350\257\242\351\227\256.py"
new file mode 100644
index 0000000000..b95c9fcb35
--- /dev/null
+++ "b/crazy_functions/\346\211\271\351\207\217\346\226\207\344\273\266\350\257\242\351\227\256.py"
@@ -0,0 +1,496 @@
+import os
+import threading
+import time
+from dataclasses import dataclass
+from typing import List, Tuple, Dict, Generator
+
+from crazy_functions.crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
+from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit
+from crazy_functions.rag_fns.rag_file_support import extract_text
+from request_llms.bridge_all import model_info
+from toolbox import update_ui, CatchException, report_exception
+
+
+@dataclass
+class FileFragment:
+    """文件片段数据类,用于组织处理单元"""
+    file_path: str
+    content: str
+    rel_path: str
+    fragment_index: int
+    total_fragments: int
+
+
+class BatchDocumentSummarizer:
+    """优化的文档总结器 - 批处理版本"""
+
+    def __init__(self, llm_kwargs: Dict, plugin_kwargs: Dict, chatbot: List, history: List, system_prompt: str):
+        """初始化总结器"""
+        self.llm_kwargs = llm_kwargs
+        self.plugin_kwargs = plugin_kwargs
+        self.chatbot = chatbot
+        self.history = history
+        self.system_prompt = system_prompt
+        self.failed_files = []
+        self.file_summaries_map = {}
+
+    def _get_token_limit(self) -> int:
+        """获取模型token限制"""
+        max_token = model_info[self.llm_kwargs['llm_model']]['max_token']
+        return max_token * 3 // 4
+
+    def _create_batch_inputs(self, fragments: List[FileFragment]) -> Tuple[List, List, List]:
+        """创建批处理输入"""
+        inputs_array = []
+        inputs_show_user_array = []
+        history_array = []
+
+        for frag in fragments:
+            if self.plugin_kwargs.get("advanced_arg"):
+                i_say = (f'请按照用户要求对文件内容进行处理,文件名为{os.path.basename(frag.file_path)},'
+                         f'用户要求为:{self.plugin_kwargs["advanced_arg"]}:'
+                         f'文件内容是 ```{frag.content}```')
+                i_say_show_user = f'正在处理 {frag.rel_path} (片段 {frag.fragment_index + 1}/{frag.total_fragments})'
+            else:
+                i_say = (f'请对下面的内容用中文做总结,不超过500字,文件名是{os.path.basename(frag.file_path)},'
+                         f'内容是 ```{frag.content}```')
+                i_say_show_user = f'正在处理 {frag.rel_path} (片段 {frag.fragment_index + 1}/{frag.total_fragments})'
+
+            inputs_array.append(i_say)
+            inputs_show_user_array.append(i_say_show_user)
+            history_array.append([])
+
+        return inputs_array, inputs_show_user_array, history_array
+
+    def _process_single_file_with_timeout(self, file_info: Tuple[str, str], mutable_status: List) -> List[FileFragment]:
+        """包装了超时控制的文件处理函数"""
+
+        def timeout_handler():
+            thread = threading.current_thread()
+            if hasattr(thread, '_timeout_occurred'):
+                thread._timeout_occurred = True
+
+        # 设置超时标记
+        thread = threading.current_thread()
+        thread._timeout_occurred = False
+
+        # 设置超时定时器
+        timer = threading.Timer(self.watch_dog_patience, timeout_handler)
+        timer.start()
+
+        try:
+            fp, project_folder = file_info
+            fragments = []
+
+            # 定期检查是否超时
+            def check_timeout():
+                if hasattr(thread, '_timeout_occurred') and thread._timeout_occurred:
+                    raise TimeoutError("处理超时")
+
+            # 更新状态
+            mutable_status[0] = "检查文件大小"
+            mutable_status[1] = time.time()
+            check_timeout()
+
+            # 文件大小检查
+            if os.path.getsize(fp) > self.max_file_size:
+                self.failed_files.append((fp, f"文件过大:超过{self.max_file_size / 1024 / 1024}MB"))
+                mutable_status[2] = "文件过大"
+                return fragments
+
+            check_timeout()
+
+            # 更新状态
+            mutable_status[0] = "提取文件内容"
+            mutable_status[1] = time.time()
+
+            # 提取内容
+            content = extract_text(fp)
+            if content is None:
+                self.failed_files.append((fp, "文件解析失败:不支持的格式或文件损坏"))
+                mutable_status[2] = "格式不支持"
+                return fragments
+            elif not content.strip():
+                self.failed_files.append((fp, "文件内容为空"))
+                mutable_status[2] = "内容为空"
+                return fragments
+
+            check_timeout()
+
+            # 更新状态
+            mutable_status[0] = "分割文本"
+            mutable_status[1] = time.time()
+
+            # 分割文本
+            try:
+                paper_fragments = breakdown_text_to_satisfy_token_limit(
+                    txt=content,
+                    limit=self._get_token_limit(),
+                    llm_model=self.llm_kwargs['llm_model']
+                )
+            except Exception as e:
+                self.failed_files.append((fp, f"文本分割失败:{str(e)}"))
+                mutable_status[2] = "分割失败"
+                return fragments
+
+            check_timeout()
+
+            # 处理片段
+            rel_path = os.path.relpath(fp, project_folder)
+            for i, frag in enumerate(paper_fragments):
+                if frag.strip():
+                    fragments.append(FileFragment(
+                        file_path=fp,
+                        content=frag,
+                        rel_path=rel_path,
+                        fragment_index=i,
+                        total_fragments=len(paper_fragments)
+                    ))
+
+            mutable_status[2] = "处理完成"
+            return fragments
+
+        except TimeoutError:
+            self.failed_files.append((fp, "处理超时"))
+            mutable_status[2] = "处理超时"
+            return []
+        except Exception as e:
+            self.failed_files.append((fp, f"处理失败:{str(e)}"))
+            mutable_status[2] = "处理异常"
+            return []
+        finally:
+            timer.cancel()
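The watchdog above is cooperative: `threading.Timer` fires after `watch_dog_patience` seconds and merely sets a flag on the worker thread, which only raises when it next calls `check_timeout()`. The pattern in isolation (values are illustrative):

```python
import threading
import time

def run_with_watchdog(work_steps, patience=5.0):
    """Cooperative timeout: the timer only sets a flag; the worker must poll it."""
    thread = threading.current_thread()
    thread._timeout_occurred = False

    timer = threading.Timer(patience, lambda: setattr(thread, '_timeout_occurred', True))
    timer.start()
    try:
        for step in work_steps:
            if thread._timeout_occurred:   # poll between steps
                raise TimeoutError("处理超时")
            step()
    finally:
        timer.cancel()

run_with_watchdog([lambda: time.sleep(0.1)] * 3, patience=2.0)
```

One consequence worth knowing when tuning `watch_dog_patience`: a step that blocks (say, a stuck `extract_text`) is not interrupted — the timeout is only reported at the next poll.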
f"文件 {file_name}: {desc}\n" + else: + status_str += f"文件 {file_name}: {status} {desc}\n" + + # 更新UI + self.chatbot[-1] = [ + "处理进度", + f"正在处理文件...\n\n{status_str}" + "." * (cnt % 10 + 1) + ] + yield from update_ui(chatbot=self.chatbot, history=self.history) + + # 检查是否所有任务完成 + if all(worker_done): + break + + finally: + # 确保线程池正确关闭 + executor.shutdown(wait=False) + + # 收集结果 + processed_files = 0 + for future in futures: + try: + fragments = future.result(timeout=0.1) # 给予一个短暂的超时时间来获取结果 + all_fragments.extend(fragments) + processed_files += 1 + except concurrent.futures.TimeoutError: + # 处理获取结果超时 + file_index = futures.index(future) + self.failed_files.append((file_paths[file_index], "结果获取超时")) + continue + except Exception as e: + # 处理其他异常 + file_index = futures.index(future) + self.failed_files.append((file_paths[file_index], f"未知错误:{str(e)}")) + continue + + # 最终进度更新 + self.chatbot.append([ + "文件处理完成", + f"成功处理 {len(all_fragments)} 个片段,失败 {len(self.failed_files)} 个文件" + ]) + yield from update_ui(chatbot=self.chatbot, history=self.history) + + return all_fragments + + def _process_fragments_batch(self, fragments: List[FileFragment]) -> Generator: + """批量处理文件片段""" + from collections import defaultdict + batch_size = 64 # 每批处理的片段数 + max_retries = 3 # 最大重试次数 + retry_delay = 5 # 重试延迟(秒) + results = defaultdict(list) + + # 按批次处理 + for i in range(0, len(fragments), batch_size): + batch = fragments[i:i + batch_size] + + inputs_array, inputs_show_user_array, history_array = self._create_batch_inputs(batch) + sys_prompt_array = ["请总结以下内容:"] * len(batch) + + # 添加重试机制 + for retry in range(max_retries): + try: + response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency( + inputs_array=inputs_array, + inputs_show_user_array=inputs_show_user_array, + llm_kwargs=self.llm_kwargs, + chatbot=self.chatbot, + history_array=history_array, + sys_prompt_array=sys_prompt_array, + ) + + # 处理响应 + for j, frag in enumerate(batch): + summary = response_collection[j * 2 + 1] + if summary and summary.strip(): + results[frag.rel_path].append({ + 'index': frag.fragment_index, + 'summary': summary, + 'total': frag.total_fragments + }) + break # 成功处理,跳出重试循环 + + except Exception as e: + if retry == max_retries - 1: # 最后一次重试失败 + for frag in batch: + self.failed_files.append((frag.file_path, f"处理失败:{str(e)}")) + else: + yield from update_ui(self.chatbot.append([f"批次处理失败,{retry_delay}秒后重试...", str(e)])) + time.sleep(retry_delay) + + return results + + def _generate_final_summary_request(self) -> Tuple[List, List, List]: + """准备最终总结请求""" + if not self.file_summaries_map: + return (["无可用的文件总结"], ["生成最终总结"], [[]]) + + summaries = list(self.file_summaries_map.values()) + if all(not summary for summary in summaries): + return (["所有文件处理均失败"], ["生成最终总结"], [[]]) + + if self.plugin_kwargs.get("advanced_arg"): + i_say = "根据以上所有文件的处理结果,按要求进行综合处理:" + self.plugin_kwargs['advanced_arg'] + else: + i_say = "请根据以上所有文件的处理结果,生成最终的总结,不超过1000字。" + + return ([i_say], [i_say], [summaries]) + + def process_files(self, project_folder: str, file_paths: List[str]) -> Generator: + """处理所有文件""" + total_files = len(file_paths) + self.chatbot.append([f"开始处理", f"总计 {total_files} 个文件"]) + yield from update_ui(chatbot=self.chatbot, history=self.history) + + # 1. 准备所有文件片段 + # 在 process_files 函数中: + fragments = yield from self.prepare_fragments(project_folder, file_paths) + if not fragments: + self.chatbot.append(["处理失败", "没有可处理的文件内容"]) + return "没有可处理的文件内容" + + # 2. 
+
+    def _generate_final_summary_request(self) -> Tuple[List, List, List]:
+        """准备最终总结请求"""
+        if not self.file_summaries_map:
+            return (["无可用的文件总结"], ["生成最终总结"], [[]])
+
+        summaries = list(self.file_summaries_map.values())
+        if all(not summary for summary in summaries):
+            return (["所有文件处理均失败"], ["生成最终总结"], [[]])
+
+        if self.plugin_kwargs.get("advanced_arg"):
+            i_say = "根据以上所有文件的处理结果,按要求进行综合处理:" + self.plugin_kwargs['advanced_arg']
+        else:
+            i_say = "请根据以上所有文件的处理结果,生成最终的总结,不超过1000字。"
+
+        return ([i_say], [i_say], [summaries])
+
+    def process_files(self, project_folder: str, file_paths: List[str]) -> Generator:
+        """处理所有文件"""
+        total_files = len(file_paths)
+        self.chatbot.append(["开始处理", f"总计 {total_files} 个文件"])
+        yield from update_ui(chatbot=self.chatbot, history=self.history)
+
+        # 1. 准备所有文件片段
+        fragments = yield from self.prepare_fragments(project_folder, file_paths)
+        if not fragments:
+            self.chatbot.append(["处理失败", "没有可处理的文件内容"])
+            return "没有可处理的文件内容"
+
+        # 2. 批量处理所有文件片段
+        self.chatbot.append(["文件分析", f"共计 {len(fragments)} 个处理单元"])
+        yield from update_ui(chatbot=self.chatbot, history=self.history)
+
+        try:
+            file_summaries = yield from self._process_fragments_batch(fragments)
+        except Exception as e:
+            self.chatbot.append(["处理错误", f"批处理过程失败:{str(e)}"])
+            return "处理过程发生错误"
+
+        # 3. 为每个文件生成整体总结
+        self.chatbot.append(["生成总结", "正在汇总文件内容..."])
+        yield from update_ui(chatbot=self.chatbot, history=self.history)
+
+        # 处理每个文件的总结
+        for rel_path, summaries in file_summaries.items():
+            if len(summaries) > 1:  # 多片段文件需要生成整体总结
+                sorted_summaries = sorted(summaries, key=lambda x: x['index'])
+                if self.plugin_kwargs.get("advanced_arg"):
+                    i_say = f'请按照用户要求对文件内容进行处理,用户要求为:{self.plugin_kwargs["advanced_arg"]}:'
+                else:
+                    i_say = f"请总结文件 {os.path.basename(rel_path)} 的主要内容,不超过500字。"
+
+                try:
+                    summary_texts = [s['summary'] for s in sorted_summaries]
+                    response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
+                        inputs_array=[i_say],
+                        inputs_show_user_array=[f"生成 {rel_path} 的处理结果"],
+                        llm_kwargs=self.llm_kwargs,
+                        chatbot=self.chatbot,
+                        history_array=[summary_texts],
+                        sys_prompt_array=["你是一个优秀的助手"],
+                    )
+                    self.file_summaries_map[rel_path] = response_collection[1]
+                except Exception as e:
+                    self.chatbot.append(["警告", f"文件 {rel_path} 总结生成失败:{str(e)}"])
+                    self.file_summaries_map[rel_path] = "总结生成失败"
+            else:  # 单片段文件直接使用其唯一的总结
+                self.file_summaries_map[rel_path] = summaries[0]['summary']
+
+        # 4. 生成最终总结
+        if total_files == 1:
+            return "文件数为1,此时不调用总结模块"
+        else:
+            try:
+                # 收集所有文件的总结用于生成最终总结
+                file_summaries_for_final = []
+                for rel_path, summary in self.file_summaries_map.items():
+                    file_summaries_for_final.append(f"文件 {rel_path} 的总结:\n{summary}")
+
+                if self.plugin_kwargs.get("advanced_arg"):
+                    final_summary_prompt = ("根据以下所有文件的总结内容,按要求进行综合处理:"
+                                            + self.plugin_kwargs['advanced_arg'])
+                else:
+                    final_summary_prompt = "请根据以下所有文件的总结内容,生成最终的总结报告。"
+
+                response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
+                    inputs_array=[final_summary_prompt],
+                    inputs_show_user_array=["生成最终总结报告"],
+                    llm_kwargs=self.llm_kwargs,
+                    chatbot=self.chatbot,
+                    history_array=[file_summaries_for_final],
+                    sys_prompt_array=["总结所有文件内容。"],
+                    max_workers=1
+                )
+
+                return response_collection[1] if len(response_collection) > 1 else "生成总结失败"
+            except Exception as e:
+                self.chatbot.append(["错误", f"最终总结生成失败:{str(e)}"])
+                return "生成总结失败"
+
+    def save_results(self, final_summary: str):
+        """保存结果到文件"""
+        from toolbox import promote_file_to_downloadzone, write_history_to_file
+        from crazy_functions.doc_fns.batch_file_query_doc import MarkdownFormatter, HtmlFormatter, WordFormatter
+
+        timestamp = time.strftime("%Y%m%d_%H%M%S")
+
+        # 创建各种格式化器
+        md_formatter = MarkdownFormatter(final_summary, self.file_summaries_map, self.failed_files)
+        html_formatter = HtmlFormatter(final_summary, self.file_summaries_map, self.failed_files)
+        word_formatter = WordFormatter(final_summary, self.file_summaries_map, self.failed_files)
+
+        result_files = []
+
+        # 保存 Markdown
+        md_content = md_formatter.create_document()
+        result_file_md = write_history_to_file(
+            history=[md_content],  # 直接传入内容列表
+            file_basename=f"文档总结_{timestamp}.md"
+        )
+        result_files.append(result_file_md)
+
+        # 保存 HTML
+        html_content = html_formatter.create_document()
+        result_file_html = write_history_to_file(
+            history=[html_content],
+            file_basename=f"文档总结_{timestamp}.html"
+        )
+        result_files.append(result_file_html)
+
+        # 保存 Word
+        doc = word_formatter.create_document()
+        # 由于 Word 文档需要用 doc.save(),我们使用与 md 文件相同的目录
+        result_file_docx = os.path.join(
+            os.path.dirname(result_file_md),
+            f"文档总结_{timestamp}.docx"
+        )
+        doc.save(result_file_docx)
+        result_files.append(result_file_docx)
+
+        # 添加到下载区
+        for file in result_files:
+            promote_file_to_downloadzone(file, chatbot=self.chatbot)
+
+        self.chatbot.append(["处理完成", f"结果已保存至: {', '.join(result_files)}"])
+
+
+@CatchException
+def 批量文件询问(txt: str, llm_kwargs: Dict, plugin_kwargs: Dict, chatbot: List,
+             history: List, system_prompt: str, user_request: str):
+    """主函数 - 优化版本"""
+    # 初始化
+    import glob
+    import re
+    from crazy_functions.rag_fns.rag_file_support import supports_format
+
+    summarizer = BatchDocumentSummarizer(llm_kwargs, plugin_kwargs, chatbot, history, system_prompt)
+    chatbot.append(["函数插件功能", f"作者:lbykkkk,批量总结文件。支持格式: {', '.join(supports_format)}等其他文本格式文件,如果长时间卡在文件处理过程,请查看处理进度,然后删除所有处于“pending”状态的文件,然后重新上传处理。"])
+    yield from update_ui(chatbot=chatbot, history=history)
+
+    # 验证输入路径
+    if not os.path.exists(txt):
+        report_exception(chatbot, history, a=f"解析项目: {txt}", b=f"找不到项目或无权访问: {txt}")
+        yield from update_ui(chatbot=chatbot, history=history)
+        return
+
+    # 获取文件列表
+    project_folder = txt
+    extract_folder = next((d for d in glob.glob(f'{project_folder}/*')
+                           if os.path.isdir(d) and d.endswith('.extract')), project_folder)
+
+    exclude_patterns = r'/[^/]+\.(zip|rar|7z|tar|gz)$'
+    file_manifest = [f for f in glob.glob(f'{extract_folder}/**', recursive=True)
+                     if os.path.isfile(f) and not re.search(exclude_patterns, f)]
+
+    if not file_manifest:
+        report_exception(chatbot, history, a=f"解析项目: {txt}", b="未找到支持的文件类型")
+        yield from update_ui(chatbot=chatbot, history=history)
+        return
+
+    # 处理所有文件并生成总结
+    final_summary = yield from summarizer.process_files(project_folder, file_manifest)
+    yield from update_ui(chatbot=chatbot, history=history)
+
+    # 保存结果
+    summarizer.save_results(final_summary)
+    yield from update_ui(chatbot=chatbot, history=history)
\ No newline at end of file
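End to end, the plugin's behavior hinges on `plugin_kwargs["advanced_arg"]`, which the new `AdvancedArgs: True` registration in crazy_functional.py exposes as the 高级参数 field in the UI. A sketch of the two invocation modes — argument values are illustrative, and chatbot/history/llm_kwargs are normally supplied by the framework rather than constructed by hand:

```python
# Default mode: per-fragment summaries, then a final report
plugin_kwargs = {}

# Custom-query mode: the same pipeline, but every prompt embeds the user's instruction
plugin_kwargs = {"advanced_arg": "提取每个文件中的所有API接口定义,并列出参数"}

# Either way the entry point is the same generator (driven by the framework):
# for _ in 批量文件询问("./private_upload/demo", llm_kwargs, plugin_kwargs,
#                      chatbot, history, system_prompt, user_request):
#     ...
```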