# AI Module Version 2.2 - Improved Office file processing with best practices
import os
from pathlib import Path
from typing import List, Literal, Optional, Callable, Union
from logger import logger
import re
import tempfile
import time
import threading
from concurrent.futures import ThreadPoolExecutor, TimeoutError

# 核心文档处理库 - 行业标准最佳实践
import PyPDF2  # PDF处理
import docx  # DOCX文件处理
import pptx  # PPTX文件处理
from bs4 import BeautifulSoup  # HTML处理
import openpyxl  # 现代Excel文件处理

# 添加跨平台兼容的文档处理库
try:
    import pandas as pd  # Excel文件的通用处理方案
    HAS_PANDAS = True
    logger.info("pandas 导入成功 - 支持高质量Excel文件处理")
except ImportError:
    HAS_PANDAS = False
    logger.warning("pandas 未安装，Excel文件处理将使用基础方案")

try:
    import docx2txt  # 跨平台DOC/DOCX文本提取
    HAS_DOCX2TXT = True
    logger.info("docx2txt 导入成功 - 支持跨平台DOC文件处理")
except ImportError:
    HAS_DOCX2TXT = False
    logger.warning("docx2txt 未安装，DOC文件处理能力受限")

try:
    import xlrd  # 旧版Excel .xls文件处理
    HAS_XLRD = True
    logger.info("xlrd 导入成功 - 支持旧版XLS文件处理")
except ImportError:
    HAS_XLRD = False
    logger.warning("xlrd 未安装，旧版XLS文件处理能力受限")

# 添加更多纯Python文档处理库支持
try:
    import extract_msg  # 处理Outlook MSG文件
    HAS_EXTRACT_MSG = True
    logger.info("extract-msg 导入成功 - 支持Outlook MSG文件处理")
except ImportError:
    HAS_EXTRACT_MSG = False

try:
    import odfpy  # 处理OpenDocument格式
    HAS_ODFPY = True
    logger.info("odfpy 导入成功 - 支持OpenDocument格式")
except ImportError:
    HAS_ODFPY = False

try:
    import python_calamine  # 高性能Excel处理
    HAS_CALAMINE = True
    logger.info("python-calamine 导入成功 - 高性能Excel处理")
except ImportError:
    HAS_CALAMINE = False
# 添加langchain unstructured image loader
try:
    from langchain_community.document_loaders import UnstructuredImageLoader
    HAS_UNSTRUCTURED = True
    logger.info("UnstructuredImageLoader 导入成功")
except ImportError as e:
    HAS_UNSTRUCTURED = False
    logger.warning(f"UnstructuredImageLoader 导入失败: {str(e)}")
    logger.info("建议执行: pip install unstructured[all-docs] 或 pip install --upgrade unstructured")
except Exception as e:
    HAS_UNSTRUCTURED = False
    error_msg = str(e).lower()
    logger.warning(f"UnstructuredImageLoader 导入时发生未预期错误: {str(e)}")
    
    if "pi_heif" in error_msg:
        logger.info("检测到缺少图片格式支持库")
        logger.info("快速解决方案: pip install pillow-heif")
    elif "pdfminer" in error_msg or "open_filename" in error_msg:
        logger.info("检测到pdfminer版本冲突")
        logger.info("快速解决方案: pip install pdfminer.six==20211012")
    else:
        logger.info("建议: pip install unstructured[all-docs] --upgrade")
try:
    # 尝试导入win32com，用于处理旧版Office文件
    import win32com.client
    import pythoncom
    HAS_WIN32COM = True
except ImportError:
    HAS_WIN32COM = False
try:
    # Detect the external `antiword` CLI, used for legacy Word .doc files.
    # shutil.which searches PATH directly, so it works on Windows (where the
    # `which` binary does not exist) and does not spawn a child process.
    import shutil
    import subprocess  # kept: other parts of the module may rely on this name
    HAS_ANTIWORD = shutil.which('antiword') is not None
except Exception:
    HAS_ANTIWORD = False
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
from langchain.schema import Document, SystemMessage, HumanMessage
from PIL import Image, ImageEnhance, ImageFilter
import pytesseract
import requests
import base64

# OpenCV图片处理（可选）
try:
    import cv2
    import numpy as np
    HAS_OPENCV = True
    logger.info("OpenCV 导入成功 - 支持高级图片预处理")
except ImportError:
    HAS_OPENCV = False
    logger.info("OpenCV 未安装，将使用基础图片处理功能")
from langchain_community.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
import threading
import time
import json
import re
import tempfile
import hashlib
from collections import defaultdict
import queue
import shutil
from concurrent.futures import ThreadPoolExecutor, as_completed, TimeoutError
from queue import Queue
import logging
import signal
import gc

# Qt imports for tree widget display




# Module-level counter, initialised to 0.
# NOTE(review): a `global` statement at module scope has no effect; functions
# that rebind this counter must declare `global processed_count` themselves.
global processed_count
processed_count = 0

# Type aliases
ImageRecognitionMethod = Literal["tesseract", "llava", "both", "kimi", "deepseek"]
LLMType = Literal["openai", "ollama"]

class FileLoadTimeoutException(Exception):
    """Raised when loading a file does not finish within the allowed time."""

# 添加看门狗类
class WatchdogTimer:
    """One-shot watchdog built on ``threading.Timer``.

    After ``start()``, the watchdog fires once ``timeout`` seconds later unless
    ``stop()`` (or another ``start()``) cancels it first. When it fires it
    calls ``callback`` if one was given, otherwise it logs an error.

    Attributes:
        timeout: Delay in seconds before the watchdog fires.
        callback: Optional zero-argument callable invoked on expiry.
        timer: The pending ``threading.Timer``, or ``None`` when not armed.
        is_active: ``True`` between ``start()`` and expiry/``stop()``.
    """

    def __init__(self, timeout: float, callback: Callable = None):
        self.timeout = timeout
        self.callback = callback
        self.timer = None
        self.is_active = False

    def _timeout_handler(self):
        """Run on the timer thread when the timeout elapses."""
        self.is_active = False
        if not self.callback:
            logger.error(f"看门狗超时 ({self.timeout}秒)")
            return
        self.callback()

    def start(self):
        """Arm the watchdog, cancelling any timer that is still pending."""
        previous = self.timer
        if previous:
            previous.cancel()
        self.is_active = True
        fresh = threading.Timer(self.timeout, self._timeout_handler)
        self.timer = fresh
        fresh.start()

    def stop(self):
        """Disarm the watchdog and drop the pending timer, if any."""
        self.is_active = False
        pending = self.timer
        if pending:
            pending.cancel()
            self.timer = None

# 🚀 优化：添加超时装饰器，缩短默认超时时间
def timeout_decorator(timeout_seconds: int = 30):
    """Build a decorator that bounds how long the caller waits for a function.

    The wrapped function runs on a daemon thread and the caller waits at most
    ``timeout_seconds`` for it.

    Note: Python threads cannot be killed. On timeout the worker thread keeps
    running in the background; only the caller is released.

    Args:
        timeout_seconds: Maximum number of seconds to wait for the call.

    Returns:
        A decorator. The decorated function returns the original result,
        re-raises any ``Exception`` the original raised, raises
        ``TimeoutError`` when the deadline passes, and raises ``RuntimeError``
        if the worker ended without producing a result or an ``Exception``.
    """
    import functools

    def decorator(func):
        # functools.wraps keeps __name__/__doc__ of the wrapped function so
        # logs and introspection still refer to the real function.
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            result_queue = queue.Queue()
            exception_queue = queue.Queue()

            def target():
                try:
                    result_queue.put(func(*args, **kwargs))
                except Exception as e:
                    exception_queue.put(e)

            thread = threading.Thread(target=target, daemon=True)
            thread.start()

            # Wait for the result or for the deadline, whichever comes first.
            thread.join(timeout=timeout_seconds)

            if thread.is_alive():
                # The worker cannot be interrupted; it is abandoned here.
                logger.error(f"函数 {func.__name__} 执行超时 ({timeout_seconds}秒)")
                gc.collect()
                raise TimeoutError(f"函数执行超时 ({timeout_seconds}秒)")

            if not exception_queue.empty():
                raise exception_queue.get()

            # A None result is still a queued item, so it is returned normally.
            if not result_queue.empty():
                return result_queue.get()

            # Neither a result nor an Exception: the worker ended abnormally
            # (e.g. a BaseException that is not an Exception).
            raise RuntimeError(f"函数 {func.__name__} 未正常返回结果")

        return wrapper
    return decorator

# 安全的LLM调用函数将在FileClassification类定义后定义

def get_dynamic_timeout(file_path: str) -> int:
    """Suggest a load timeout (seconds) from the file's extension and size.

    A per-extension base value is looked up (10 s for unknown extensions),
    raster images get 15 extra seconds for OCR / multimodal recognition, files
    above 5/10/50 MB are scaled by 1.2/1.4/1.8, and the result is clamped to
    8-45 s for images or 5-30 s for everything else.

    Args:
        file_path: Path of the file to be loaded.

    Returns:
        int: Suggested timeout in seconds. If the computation fails (for
        example the file size cannot be read), 25 for images and 12 otherwise.
    """
    image_suffixes = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp']

    try:
        suffix = Path(file_path).suffix.lower()
        size_mb = os.path.getsize(file_path) / (1024 * 1024)

        # Base budget per extension, in seconds.
        per_suffix_seconds = {
            # plain text
            '.txt': 3, '.md': 3, '.rtf': 5,
            # Word
            '.docx': 8, '.doc': 12,
            # Excel
            '.xlsx': 8, '.xls': 12,
            # PowerPoint
            '.pptx': 10, '.ppt': 15,
            # PDF
            '.pdf': 12,
            # images
            '.jpg': 20, '.jpeg': 20, '.png': 18,
            '.bmp': 25, '.tiff': 30, '.gif': 12, '.webp': 18, '.svg': 5,
        }
        is_image = suffix in image_suffixes

        budget = per_suffix_seconds.get(suffix, 10)
        if is_image:
            # Extra headroom for OCR / multimodal recognition.
            budget += 15

        # Larger files get proportionally more time.
        if size_mb > 50:
            scale = 1.8
        elif size_mb > 10:
            scale = 1.4
        elif size_mb > 5:
            scale = 1.2
        else:
            scale = None
        if scale is not None:
            budget *= scale

        # Clamp so a single file can never stall processing for long.
        lower, upper = (8, 45) if is_image else (5, 30)
        return max(lower, min(upper, int(budget)))

    except Exception as e:
        logger.warning(f"计算文件 {file_path} 超时时间失败: {str(e)}")
        if Path(file_path).suffix.lower() in image_suffixes:
            return 25
        return 12


def load_document_with_timeout(file_path: str, max_chars: int = 5000, api_key: str = '', 
                              is_local: bool = False, image_recognition_method: ImageRecognitionMethod = "tesseract", 
                              timeout: int = 30, use_dynamic_timeout: bool = True, online_model: str = None,
                              api_base: str = None, model_name: str = None, moonshot_key: Optional[str] = None):
    """Run ``load_document`` on a worker thread and give up after a timeout.

    The caller waits ``timeout`` seconds, then a fixed 15-second grace period.
    If the load still has not finished, ``FileLoadTimeoutException`` is raised
    and the worker is left to finish in the background (a running thread
    cannot be cancelled).

    Args:
        file_path: Path of the file to load.
        max_chars: Maximum number of characters to extract.
        api_key: API key forwarded to ``load_document``.
        is_local: Whether a local model is used.
        image_recognition_method: Image recognition method.
        timeout: Timeout in seconds (default 30); ignored when
            ``use_dynamic_timeout`` is true.
        use_dynamic_timeout: If true, replace ``timeout`` with the value from
            ``get_dynamic_timeout(file_path)``.
        online_model: Forwarded unchanged to ``load_document``.
        api_base: Forwarded unchanged to ``load_document``.
        model_name: Forwarded unchanged to ``load_document``.
        moonshot_key: Moonshot API key (used for file parsing).

    Returns:
        Whatever ``load_document`` returns.

    Raises:
        FileLoadTimeoutException: The load did not finish in time, or waiting
            during the grace period failed.
        Exception: Any non-timeout error raised by ``load_document`` during
            the first wait propagates unchanged.
    """
    if use_dynamic_timeout:
        timeout = get_dynamic_timeout(file_path)
        logger.debug(f"文件 {file_path} 使用动态超时: {timeout}秒")

    def target_function():
        return load_document(file_path, max_chars, api_key, is_local, image_recognition_method, online_model, api_base, model_name, moonshot_key)

    # Grace period granted after the first timeout; kept short to fail fast.
    extended_timeout = 15

    # The executor is NOT used as a context manager: leaving a `with` block
    # calls shutdown(wait=True), which would block on the stuck worker and
    # defeat the timeout. shutdown(wait=False) releases the caller at once.
    executor = ThreadPoolExecutor(max_workers=1)
    try:
        future = executor.submit(target_function)
        try:
            logger.info("开始识别文件 {0}".format(file_path))
            result = future.result(timeout=timeout)
            logger.info(result)
            return result
        except TimeoutError:
            if not future.done():
                logger.warning(f"文件 {file_path} 加载超时（{timeout}秒），但任务仍在执行中，尝试等待更多时间...")
                try:
                    result = future.result(timeout=extended_timeout)
                    logger.info(f"延长等待后成功获取结果: {file_path}")
                    return result
                except TimeoutError:
                    logger.error(f"文件 {file_path} 最终超时，快速失败避免卡死")
                    # Deliberately no future.cancel(): it cannot stop a running
                    # task (or a subprocess it spawned) and only complicates
                    # cleanup. The task is left to finish in the background.
                    logger.warning(f"注意：后台任务将继续完成（不会影响后续处理）")
                except Exception as e:
                    # Any other failure while waiting is logged and reported
                    # as a timeout instead of crashing the caller.
                    logger.error(f"等待文件 {file_path} 加载时发生异常: {str(e)}")
                    logger.warning("忽略该异常，继续处理")

            raise FileLoadTimeoutException(f"文件加载超时：{file_path}")
    finally:
        executor.shutdown(wait=False)

# Structured classification result for one document.
# NOTE(review): the Field descriptions are runtime strings; presumably they feed
# the LLM format instructions via PydanticOutputParser (imported above) — confirm
# before rewording them. No class docstring is added on purpose, since pydantic
# would expose it in the model's schema.
class FileClassification(BaseModel):
    # Main category of the document.
    category: str = Field(description="文档的主要分类类别，应该是一个具体且具有概括性的类别名称")
    # Detailed sub-category of the document.
    subcategory: str = Field(description="文档的详细子类别，应该是一个具体的分类名称，必须与文档内容高度相关")
    # Classification confidence; the description asks for a value in [0, 1],
    # but no validator enforces that range here.
    confidence: float = Field(description="分类的置信度，范围在0到1之间")

# 🚀 优化：添加带超时的LLM调用函数，大幅缩短超时时间
@timeout_decorator(timeout_seconds=90)  # reduced from 600 s (10 min) to 90 s
def safe_classify_document(content: str, llm: ChatOpenAI, custom_prompt: str = None, 
                         filename: str = None, reference_items: dict = None, 
                         language: str = "Chinese") -> FileClassification:
    """Call ``classify_document`` under a 90-second time limit.

    All arguments are forwarded positionally, in order, to
    ``classify_document`` and its result is returned unchanged. The limit
    comes from ``timeout_decorator``, which raises ``TimeoutError`` when the
    call is still running after 90 seconds.
    """
    return classify_document(content, llm, custom_prompt, filename, reference_items, language)

@timeout_decorator(timeout_seconds=90)  # reduced from 600 s (10 min) to 90 s
def safe_match_existing_categories(content: str, existing_categories: List[str], llm: ChatOpenAI, 
                                 custom_prompt: str = "", filename: str = None, 
                                 reference_items: dict = None, language: str = "Chinese") -> str:
    """Call ``match_existing_categories`` under a 90-second time limit.

    All arguments are forwarded positionally, in order, to
    ``match_existing_categories`` and its result is returned unchanged. The
    limit comes from ``timeout_decorator``, which raises ``TimeoutError`` when
    the call is still running after 90 seconds.
    """
    return match_existing_categories(content, existing_categories, llm, custom_prompt, 
                                   filename, reference_items, language)

# 类型定义已移到文件开头


def encode_image_to_base64(image_path: str) -> str:
    """Return the raw bytes of the file at ``image_path`` as base64 text."""
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')


def recognize_image_with_llava(image_path: str, max_chars: int = 5000, timeout: float = 120) -> str:
    """Describe an image with the LLaVA model served by a local Ollama.

    The image is sent base64-encoded to ``http://localhost:11434/api/generate``
    with streaming disabled.

    Args:
        image_path: Path of the image file.
        max_chars: Maximum number of characters to return.
        timeout: Seconds to wait for the HTTP request. Without a timeout
            ``requests`` waits indefinitely, which would pin the worker thread
            forever if Ollama stalls.

    Returns:
        The model's description truncated to ``max_chars``, or ``""`` on any
        failure (the error is logged).
    """
    try:
        base64_image = encode_image_to_base64(image_path)

        data = {
            "model": "llava",
            "prompt": "请详细描述这张图片的内容，包括文字、场景、物体等所有可见信息。将返回的结果控制在100个文字以内",
            "images": [base64_image],
            "stream": False
        }

        response = requests.post(
            "http://localhost:11434/api/generate", json=data, timeout=timeout
        )
        response.raise_for_status()

        result = response.json()
        # `or ""` also covers an explicit null "response" field.
        text = result.get("response") or ""
        return text[:max_chars]
    except Exception as e:
        logger.error(f"使用LLaVA模型识别图片失败: {str(e)}")
        return ""


def recognize_image_with_moonshot(image_path: str, api_key: str, max_chars: int = 5000) -> str:
    """Extract the text of an image through the Moonshot file-extract API.

    The image is uploaded with purpose ``file-extract`` and the extracted
    content is parsed as JSON; a non-empty ``content`` field is returned
    directly. Otherwise the extracted content is sent as a system message to
    the ``moonshot-v1-8k`` chat model, which is asked to list the text in the
    image. The uploaded file is deleted afterwards on every path, including
    failures.

    Args:
        image_path: Path of the image file.
        api_key: Moonshot API key.
        max_chars: Maximum number of characters to return.

    Returns:
        The recognised text truncated to ``max_chars``, or ``""`` on any
        failure (the error is logged with a hint for 401/403/429 errors).
    """
    try:
        from openai import OpenAI
        import json

        image_name = os.path.basename(image_path)
        logger.info(f"🌙 开始使用Moonshot API识别图片: {image_name}")

        client = OpenAI(
            base_url="https://api.moonshot.cn/v1",
            api_key=api_key
        )

        file_object = client.files.create(file=Path(image_path), purpose="file-extract")
        try:
            file_content = client.files.content(file_id=file_object.id).text
            logger.info(f"🔍 文件上传成功，文件ID: {file_object.id}")

            # Fast path: the extracted content is usually JSON with a
            # "content" field holding the recognised text.
            try:
                content_data = json.loads(file_content)
                extracted_text = content_data.get("content", "")
                has_text = bool(extracted_text and extracted_text.strip())
            except (json.JSONDecodeError, KeyError, AttributeError, TypeError):
                # Not JSON, or JSON of an unexpected shape (e.g. a list, or a
                # non-string "content"): fall back to chat mode.
                logger.info("⚠️  文件内容不是JSON格式，尝试使用对话模式")
            else:
                if has_text:
                    logger.info(f"✅ Moonshot API直接提取文字成功，内容长度: {len(extracted_text)} 字符")
                    logger.info(f"【Moonshot API识别内容】 {image_name}:\n{extracted_text}")
                    return extracted_text[:max_chars]
                logger.info("⚠️  直接提取未获得有效内容，尝试使用对话模式")

            # Fallback: let the chat model read the extracted content.
            messages = [
                {
                    "role": "system",
                    "content": file_content
                },
                {
                    "role": "user",
                    "content": "请提取这张图片中的所有文字内容，包括标题、正文、标签等。如果图片中没有文字，请描述图片的主要内容。请用简洁的方式输出，不要添加额外的解释。"
                }
            ]

            completion = client.chat.completions.create(
                model="moonshot-v1-8k",
                messages=messages,
                max_tokens=1000,  # cap the output length
                temperature=0.1   # low randomness
            )

            # The API may return null content; treat it as empty text.
            description = completion.choices[0].message.content or ""
            logger.info(f"✅ Moonshot API对话模式识别成功，内容长度: {len(description)} 字符")
            logger.info(f"【Moonshot API对话模式识别内容】 {image_name}:\n{description}")
            return description[:max_chars]
        finally:
            # Best-effort removal of the uploaded file; a failure here must
            # not mask the recognition result or the original error.
            try:
                client.files.delete(file_id=file_object.id)
            except Exception as cleanup_error:
                logger.debug(f"删除Moonshot临时文件失败（已忽略）: {cleanup_error}")

    except Exception as e:
        error_msg = str(e)
        if "401" in error_msg or "Invalid Authentication" in error_msg:
            logger.error(f"❌ Moonshot API认证失败: {error_msg}")
            logger.error("💡 请检查API密钥是否正确")
        elif "403" in error_msg or "Forbidden" in error_msg:
            logger.error(f"❌ Moonshot API访问被拒绝: {error_msg}")
            logger.error("💡 可能原因：API密钥权限不足或账户余额不足")
        elif "429" in error_msg or "rate limit" in error_msg.lower():
            logger.error(f"❌ Moonshot API请求频率超限: {error_msg}")
            logger.error("💡 请稍后重试或检查API配额")
        else:
            logger.error(f"❌ 使用Moonshot API识别图片失败: {error_msg}")
        return ""


def recognize_image_with_online_model(image_path: str, api_key: str, api_base: str, model_name: str, max_chars: int = 5000) -> str:
    """Describe an image with a user-configured OpenAI-compatible vision model.

    The image is embedded in the request as a base64 ``data:`` URL whose MIME
    type is derived from the file extension (falling back to ``image/jpeg``
    when it cannot be determined).

    Args:
        image_path: Path of the image file.
        api_key: API key for the service.
        api_base: Base URL of the OpenAI-compatible endpoint.
        model_name: Name of the model to call.
        max_chars: Maximum number of characters to return.

    Returns:
        The model's description truncated to ``max_chars``, or ``""`` on any
        failure (the error is logged).
    """
    try:
        import base64
        import mimetypes
        from openai import OpenAI

        image_name = os.path.basename(image_path)
        logger.info(f"🤖 开始使用在线模型识别图片: {model_name}, 图片: {image_name}")

        with open(image_path, "rb") as image_file:
            base64_image = base64.b64encode(image_file.read()).decode('utf-8')

        # Label the payload with its real type; PNG/WebP/GIF data must not be
        # declared as JPEG.
        mime_type, _ = mimetypes.guess_type(image_path)
        if not mime_type or not mime_type.startswith("image/"):
            mime_type = "image/jpeg"

        client = OpenAI(
            base_url=api_base,
            api_key=api_key
        )

        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "请详细分析这张图片的内容，包括：1. 图片中的所有文字内容（如果有）2. 主要物体和场景 3. 图片的用途或类型 4. 任何重要的细节信息。请用中文回答，内容要准确且有条理。"
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{base64_image}"
                        }
                    }
                ]
            }
        ]

        response = client.chat.completions.create(
            model=model_name,
            messages=messages,
            max_tokens=1000
        )

        # The API may return null content; treat it as empty text.
        description = response.choices[0].message.content or ""
        logger.info(f"🤖 在线模型 {model_name} 识别成功，内容长度: {len(description)} 字符")
        logger.info(f"【在线模型识别内容】 {image_name} ({model_name}):\n{description}")
        return description[:max_chars]

    except Exception as e:
        logger.error(f"❌ 使用在线模型 {model_name} 识别图片失败: {str(e)}")
        return ""


def preprocess_image_for_ocr(image_path: str) -> str:
    """Write an OCR-friendly copy of an image to a temporary PNG file.

    PIL converts the image to RGB and raises contrast (x1.5), sharpness (x1.2)
    and brightness (x1.1). When OpenCV is available the result is additionally
    converted to grayscale, blurred, adaptively thresholded and morphologically
    closed; if that step fails the PIL result is kept.

    Args:
        image_path: Path of the source image.

    Returns:
        Path of the temporary PNG (the caller must delete it), or
        ``image_path`` itself when preprocessing fails.
    """
    temp_path = None
    try:
        import tempfile
        import uuid
        temp_path = os.path.join(tempfile.gettempdir(), f"enhanced_{uuid.uuid4().hex[:8]}.png")

        # Basic enhancement with PIL.
        with Image.open(image_path) as img:
            if img.mode != 'RGB':
                img = img.convert('RGB')

            for enhancer_cls, factor in (
                (ImageEnhance.Contrast, 1.5),
                (ImageEnhance.Sharpness, 1.2),
                (ImageEnhance.Brightness, 1.1),
            ):
                img = enhancer_cls(img).enhance(factor)

            # PNG is lossless, so no JPEG-style quality setting is passed.
            img.save(temp_path, 'PNG')

        # Optional binarisation with OpenCV.
        if HAS_OPENCV:
            try:
                cv_img = cv2.imread(temp_path)
                if cv_img is not None:
                    gray = cv2.cvtColor(cv_img, cv2.COLOR_BGR2GRAY)

                    # Light Gaussian blur to suppress noise before thresholding.
                    blurred = cv2.GaussianBlur(gray, (3, 3), 0)

                    thresh = cv2.adaptiveThreshold(
                        blurred, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                        cv2.THRESH_BINARY, 11, 2
                    )

                    # Morphological closing to remove small speckles.
                    kernel = np.ones((2, 2), np.uint8)
                    processed = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel)

                    cv2.imwrite(temp_path, processed)
                    logger.info(f"OpenCV高级预处理完成: {image_path}")
            except Exception as opencv_e:
                logger.warning(f"OpenCV处理失败，使用PIL结果: {str(opencv_e)}")

        logger.info(f"图片预处理完成: {image_path} -> {temp_path}")
        return temp_path

    except Exception as e:
        logger.warning(f"图片预处理失败，使用原始图片: {str(e)}")
        # The caller only deletes the path we return, so a partially written
        # temp file would otherwise be left behind.
        if temp_path:
            try:
                if os.path.exists(temp_path):
                    os.remove(temp_path)
            except OSError:
                pass
        return image_path


def recognize_image_with_basic_ocr(image_path: str, max_chars: int = 5000) -> str:
    """Run Tesseract OCR on an image and on its preprocessed copy.

    Both the original image and the output of ``preprocess_image_for_ocr`` are
    recognised with ``lang='chi_sim+eng'``; the result with more non-blank
    text wins. The temporary preprocessed file is removed afterwards.

    Args:
        image_path: Path of the image file.
        max_chars: Maximum number of characters to return.

    Returns:
        The recognised text truncated to ``max_chars``, or ``""`` when nothing
        was recognised or OCR failed (the error is logged).
    """
    processed_image_path = None
    try:
        logger.info("使用增强版OCR方法处理图片")

        # 1. Preprocess; returns image_path itself if preprocessing failed.
        processed_image_path = preprocess_image_for_ocr(image_path)

        # 2. OCR with pytesseract.
        try:
            # Images are opened with `with` so the file handles are released
            # before the temp file is deleted below (an open handle makes
            # os.remove fail on Windows).
            with Image.open(image_path) as img_original:
                text_original = pytesseract.image_to_string(img_original, lang='chi_sim+eng')

            if processed_image_path != image_path:
                with Image.open(processed_image_path) as img_processed:
                    text_processed = pytesseract.image_to_string(img_processed, lang='chi_sim+eng')
            else:
                # Preprocessing fell back to the original file; a second OCR
                # pass over the same image would only repeat the work.
                text_processed = text_original

            # Keep whichever pass produced more non-blank text.
            if len(text_processed.strip()) > len(text_original.strip()):
                text = text_processed
                logger.info(f"增强版OCR识别效果更好，内容长度: {len(text)} 字符")
            elif text_original.strip():
                text = text_original
                logger.info(f"原始OCR识别效果更好，内容长度: {len(text)} 字符")
            else:
                text = text_processed
                logger.info(f"OCR识别有限，内容长度: {len(text)} 字符")

            if text and text.strip():
                return text[:max_chars]
            logger.info("OCR未提取到有效文本")
            return ""

        except ImportError:
            logger.warning("pytesseract 未安装，无法使用OCR")
            return ""
        except Exception as e:
            logger.warning(f"OCR识别失败: {str(e)}")
            return ""

    except Exception as e:
        logger.error(f"增强版OCR方法失败: {str(e)}")
        return ""
    finally:
        # Remove the temporary preprocessed copy, never the caller's file.
        if processed_image_path and processed_image_path != image_path:
            try:
                if os.path.exists(processed_image_path):
                    os.remove(processed_image_path)
            except Exception as e:
                logger.warning(f"清理临时文件失败: {str(e)}")

# ================== 文档处理最佳实践函数 ==================

def extract_docx_content(file_path: str, max_chars: int = 5000) -> tuple[str, str]:
    """Extract text from a DOCX file.

    python-docx is tried first (non-empty paragraphs, then one line per table
    row with non-empty cells joined by " | "). If that yields nothing or
    fails, docx2txt is tried when it is installed.

    Args:
        file_path: Path of the DOCX file.
        max_chars: Maximum number of characters to return.

    Returns:
        tuple: ``(extracted_text, method_used)``; ``("", "未能提取内容")`` when
        neither method produced text.
    """
    # Method 1: python-docx (preferred).
    try:
        document = docx.Document(file_path)

        lines = [p.text.strip() for p in document.paragraphs if p.text.strip()]

        for table in document.tables:
            for row in table.rows:
                stripped_cells = (cell.text.strip() for cell in row.cells)
                filled_cells = [value for value in stripped_cells if value]
                if filled_cells:
                    lines.append(" | ".join(filled_cells))

        primary_text = "\n".join(lines)[:max_chars]
        if primary_text.strip():
            return primary_text, "python-docx"
    except Exception as e:
        logger.debug(f"python-docx处理失败: {str(e)}")

    # Method 2: docx2txt (fallback), only when the package is available.
    if not HAS_DOCX2TXT:
        return "", "未能提取内容"

    try:
        fallback_text = docx2txt.process(file_path)
        if fallback_text and fallback_text.strip():
            return fallback_text[:max_chars], "docx2txt"
    except Exception as e:
        logger.debug(f"docx2txt处理失败: {str(e)}")

    return "", "未能提取内容"

def _doc_text_via_word_com(file_path: str) -> str:
    """Read the full text of a document through Word COM automation.

    The document and the Word instance are always closed, even on failure, so
    no hidden Word process is left behind.
    """
    import pythoncom
    pythoncom.CoInitialize()
    word = None
    doc = None
    try:
        word = win32com.client.Dispatch("Word.Application")
        word.Visible = False
        # Word resolves relative paths against its own working directory,
        # so always hand it an absolute path.
        doc = word.Documents.Open(os.path.abspath(file_path))
        return doc.Content.Text
    finally:
        try:
            if doc is not None:
                doc.Close(False)  # close without saving
        except Exception as close_error:
            logger.debug(f"关闭Word文档失败: {str(close_error)}")
        try:
            if word is not None:
                word.Quit()
        except Exception as quit_error:
            logger.debug(f"退出Word失败: {str(quit_error)}")
        pythoncom.CoUninitialize()


def _doc_text_via_raw_scan(file_path: str, max_chars: int):
    """Scan the raw bytes of a file for readable Latin/CJK text runs.

    Returns ``(text, method)`` for the first encoding that yields enough
    text, or ``None`` when no encoding does.
    """
    with open(file_path, 'rb') as f:
        raw_data = f.read()

    # A letter or CJK character followed by at least five readable characters.
    readable_run = re.compile(r'[a-zA-Z\u4e00-\u9fff][a-zA-Z0-9\u4e00-\u9fff\s\.,;!?]{5,}')

    for encoding in ['utf-8', 'gbk', 'utf-16le', 'latin1']:
        try:
            decoded = raw_data.decode(encoding, errors='ignore')
            text_matches = readable_run.findall(decoded)
            if len(text_matches) > 3:
                text = ' '.join(text_matches[:20])  # first 20 fragments
                if len(text.strip()) > 50:  # require a meaningful amount
                    return text[:max_chars], f"文本提取({encoding})"
        except Exception as decode_error:
            logger.debug(f"基础文本提取({encoding})失败: {str(decode_error)}")
            continue
    return None


def extract_doc_content(file_path: str, max_chars: int = 5000) -> tuple[str, str]:
    """Extract text from a legacy Word DOC file.

    Methods are tried in order: docx2txt (if installed), Word COM automation
    (if pywin32 is available), then a raw-byte scan for readable text.

    Args:
        file_path: Path of the DOC file.
        max_chars: Maximum number of characters to return.

    Returns:
        tuple: ``(extracted_text, method_used)``. When every method fails the
        text is a block of installation advice and the method is "处理建议".
    """
    # Method 1: docx2txt (cross-platform).
    if HAS_DOCX2TXT:
        try:
            text = docx2txt.process(file_path)
            if text and text.strip():
                return text[:max_chars], "docx2txt"
        except Exception as e:
            logger.debug(f"docx2txt处理DOC失败: {str(e)}")

    # Method 2: Word COM automation (Windows with Office installed).
    if HAS_WIN32COM:
        try:
            text = _doc_text_via_word_com(file_path)
            if text and text.strip():
                return text[:max_chars], "win32com"
        except Exception as e:
            logger.debug(f"win32com处理DOC失败: {str(e)}")

    # Method 3: raw text scan (last resort).
    try:
        scanned = _doc_text_via_raw_scan(file_path, max_chars)
        if scanned is not None:
            return scanned
    except Exception as e:
        logger.debug(f"基础文本提取失败: {str(e)}")

    # Nothing worked: return actionable advice instead of content.
    suggestion = """DOC文件处理建议:
1. 安装 docx2txt: pip install docx2txt
2. Windows用户可安装pywin32: pip install pywin32 (需要Office)
3. 或将DOC文件另存为DOCX格式以获得更好支持"""

    return suggestion, "处理建议"

def _xlsx_sheets_via_pandas(file_path: str) -> list:
    """Render up to 5 sheets (50 rows each) as text using pandas."""
    sheets = []
    # The context manager closes the workbook, so the file is not left locked.
    with pd.ExcelFile(file_path) as excel_file:
        for sheet_name in excel_file.sheet_names[:5]:
            try:
                df = pd.read_excel(excel_file, sheet_name=sheet_name, nrows=50)
                if not df.empty:
                    # Blank out missing values and render everything as text.
                    df_str = df.fillna('').astype(str)
                    sheets.append(
                        f"工作表: {sheet_name}\n"
                        + df_str.to_string(index=False, max_rows=50)
                    )
            except Exception as e:
                logger.debug(f"pandas处理工作表 {sheet_name} 失败: {str(e)}")
                continue
    return sheets


def _xlsx_sheets_via_openpyxl(file_path: str) -> list:
    """Render up to 5 sheets (50 rows each) as text using openpyxl."""
    wb = openpyxl.load_workbook(file_path, read_only=True, data_only=True)
    try:
        sheets = []
        for sheet_name in wb.sheetnames[:5]:
            try:
                sheet = wb[sheet_name]
                parts = [f"工作表: {sheet_name}\n"]
                for row in sheet.iter_rows(max_row=50, values_only=True):
                    row_values = [str(cell) if cell is not None else "" for cell in row]
                    # Skip rows that contain only blank cells.
                    if any(val.strip() for val in row_values):
                        parts.append(" | ".join(row_values) + "\n")
                sheets.append("".join(parts))
            except Exception as e:
                logger.debug(f"openpyxl处理工作表 {sheet_name} 失败: {str(e)}")
                continue
        return sheets
    finally:
        # Read-only workbooks keep the file open until closed explicitly.
        wb.close()


def _xlsx_sheets_via_excel_com(file_path: str) -> list:
    """Render up to 5 sheets (50 rows each) as text through Excel COM.

    The workbook and the Excel instance are always closed, even on failure.
    """
    import pythoncom
    pythoncom.CoInitialize()
    excel = None
    workbook = None
    try:
        excel = win32com.client.Dispatch("Excel.Application")
        excel.Visible = False
        # Excel resolves relative paths against its own working directory.
        workbook = excel.Workbooks.Open(os.path.abspath(file_path))

        sheets = []
        for i, sheet in enumerate(workbook.Sheets):
            if i >= 5:
                break
            try:
                used_range = sheet.UsedRange
                if used_range:
                    sheet_content = f"工作表: {sheet.Name}\n"
                    data = used_range.Value
                    if data:
                        if isinstance(data[0], (list, tuple)):
                            for row in data[:50]:
                                row_str = " | ".join([str(cell) if cell is not None else "" for cell in row])
                                if row_str.strip():
                                    sheet_content += row_str + "\n"
                        else:
                            sheet_content += str(data) + "\n"
                    sheets.append(sheet_content)
            except Exception as e:
                logger.debug(f"win32com处理工作表失败: {str(e)}")
                continue
        return sheets
    finally:
        try:
            if workbook is not None:
                workbook.Close(False)  # close without saving
        except Exception as close_error:
            logger.debug(f"关闭Excel工作簿失败: {str(close_error)}")
        try:
            if excel is not None:
                excel.Quit()
        except Exception as quit_error:
            logger.debug(f"退出Excel失败: {str(quit_error)}")
        pythoncom.CoUninitialize()


def extract_xlsx_content(file_path: str, max_chars: int = 5000) -> tuple[str, str]:
    """Extract text from an XLSX workbook.

    Methods are tried in order: pandas (if installed), openpyxl, then Excel
    COM automation (if pywin32 is available). Each method reads at most 5
    sheets and 50 rows per sheet; the first one that yields content wins.

    Args:
        file_path: Path of the XLSX file.
        max_chars: Maximum number of characters to return.

    Returns:
        tuple: ``(extracted_text, method_used)``. When every method fails the
        text is a block of installation advice and the method is "处理建议".
    """
    strategies = (
        (HAS_PANDAS, _xlsx_sheets_via_pandas, "pandas", "pandas处理XLSX失败"),
        (True, _xlsx_sheets_via_openpyxl, "openpyxl", "openpyxl处理XLSX失败"),
        (HAS_WIN32COM, _xlsx_sheets_via_excel_com, "win32com", "win32com处理XLSX失败"),
    )

    for enabled, extractor, method, failure_label in strategies:
        if not enabled:
            continue
        try:
            all_content = extractor(file_path)
        except Exception as e:
            logger.debug(f"{failure_label}: {str(e)}")
            continue
        if all_content:
            return "\n\n".join(all_content)[:max_chars], method

    # Nothing worked: return actionable advice instead of content.
    suggestion = """XLSX文件处理建议:
1. 推荐安装: pip install pandas (最佳Excel处理方案)
2. 备用方案: pip install openpyxl
3. Windows用户: pip install pywin32 (需要Excel)"""

    return suggestion, "处理建议"

def extract_xls_content(file_path: str, max_chars: int = 5000) -> tuple[str, str]:
    """Extract text from a legacy Excel (.xls) workbook.

    Backends are tried in order and the first one that yields content wins:

    1. pandas (only when ``HAS_PANDAS``)
    2. xlrd (only when ``HAS_XLRD``)
    3. Excel COM automation through win32com (only when ``HAS_WIN32COM``)

    Every backend looks at no more than the first 5 sheets and roughly the
    first 50 rows of each sheet. A backend failure is logged at debug level
    and the next backend is tried.

    Args:
        file_path: Path of the workbook to read.
        max_chars: Maximum number of characters of extracted text to return.

    Returns:
        tuple: ``(extracted_text, method_used)``. When no backend produced any
        content, the text is an installation hint and the method is
        ``"处理建议"``.
    """
    # Method 1: pandas (preferred)
    if HAS_PANDAS:
        try:
            all_content = []
            # The context manager closes the workbook handle even when a sheet
            # fails to parse.
            with pd.ExcelFile(file_path) as excel_file:
                for sheet_name in excel_file.sheet_names[:5]:
                    try:
                        df = pd.read_excel(excel_file, sheet_name=sheet_name, nrows=50)
                        if not df.empty:
                            sheet_content = f"工作表: {sheet_name}\n"
                            df_str = df.fillna('').astype(str)
                            sheet_content += df_str.to_string(index=False, max_rows=50)
                            all_content.append(sheet_content)
                    except Exception as e:
                        logger.debug(f"pandas处理XLS工作表 {sheet_name} 失败: {str(e)}")
                        continue

            if all_content:
                text = "\n\n".join(all_content)[:max_chars]
                return text, "pandas"
        except Exception as e:
            logger.debug(f"pandas处理XLS失败: {str(e)}")

    # Method 2: xlrd (dedicated reader for the legacy format)
    if HAS_XLRD:
        try:
            workbook = xlrd.open_workbook(file_path)
            all_content = []

            # Cap the number of sheets that are read
            num_sheets = min(5, workbook.nsheets)

            for sheet_idx in range(num_sheets):
                try:
                    sheet = workbook.sheet_by_index(sheet_idx)
                    sheet_content = f"工作表: {sheet.name}\n"

                    # Cap the number of rows that are read
                    max_rows = min(50, sheet.nrows)

                    for row_idx in range(max_rows):
                        row_values = []
                        for col_idx in range(sheet.ncols):
                            cell_value = sheet.cell_value(row_idx, col_idx)
                            # Date cells hold a serial number; render them as Y-M-D
                            if sheet.cell_type(row_idx, col_idx) == xlrd.XL_CELL_DATE:
                                try:
                                    date_tuple = xlrd.xldate_as_tuple(cell_value, workbook.datemode)
                                    cell_value = f"{date_tuple[0]}-{date_tuple[1]}-{date_tuple[2]}"
                                except Exception:
                                    # Keep the raw serial value when it cannot be converted
                                    pass
                            row_values.append(str(cell_value) if cell_value is not None else "")

                        # Skip rows that contain nothing but blanks
                        if any(val.strip() for val in row_values):
                            sheet_content += " | ".join(row_values) + "\n"

                    all_content.append(sheet_content)
                except Exception as e:
                    logger.debug(f"xlrd处理工作表 {sheet_idx} 失败: {str(e)}")
                    continue

            if all_content:
                text = "\n\n".join(all_content)[:max_chars]
                return text, "xlrd"
        except Exception as e:
            logger.debug(f"xlrd处理XLS失败: {str(e)}")

    # Method 3: win32com (drives a locally installed Excel on Windows)
    if HAS_WIN32COM:
        excel = None
        workbook = None
        com_initialized = False
        try:
            import pythoncom
            pythoncom.CoInitialize()
            com_initialized = True
            excel = win32com.client.Dispatch("Excel.Application")
            excel.Visible = False
            # Excel resolves relative paths against its own working directory,
            # not this process's, so always hand it an absolute path.
            workbook = excel.Workbooks.Open(os.path.abspath(file_path))

            all_content = []
            for i, sheet in enumerate(workbook.Sheets):
                if i >= 5:
                    break
                try:
                    used_range = sheet.UsedRange
                    if used_range:
                        sheet_content = f"工作表: {sheet.Name}\n"
                        data = used_range.Value
                        if data:
                            # Range.Value is a bare scalar (not a tuple of row
                            # tuples) when the used range is a single cell, so
                            # check the container type before indexing into it.
                            if isinstance(data, (list, tuple)) and isinstance(data[0], (list, tuple)):
                                for row in data[:50]:
                                    row_str = " | ".join([str(cell) if cell is not None else "" for cell in row])
                                    if row_str.strip():
                                        sheet_content += row_str + "\n"
                            else:
                                sheet_content += str(data) + "\n"
                        all_content.append(sheet_content)
                except Exception as e:
                    logger.debug(f"win32com处理XLS工作表失败: {str(e)}")
                    continue

            if all_content:
                text = "\n\n".join(all_content)[:max_chars]
                return text, "win32com"
        except Exception as e:
            logger.debug(f"win32com处理XLS失败: {str(e)}")
        finally:
            # Always release Excel; otherwise a failed extraction leaves a
            # hidden Excel process running. Each step is isolated so one
            # failing call cannot prevent the following ones.
            if workbook is not None:
                try:
                    # False = do not save; avoids an invisible "save changes?" prompt
                    workbook.Close(False)
                except Exception as e:
                    logger.debug(f"win32com关闭XLS工作簿失败: {str(e)}")
            if excel is not None:
                try:
                    excel.Quit()
                except Exception as e:
                    logger.debug(f"win32com退出Excel失败: {str(e)}")
            if com_initialized:
                try:
                    pythoncom.CoUninitialize()
                except Exception as e:
                    logger.debug(f"win32com释放COM失败: {str(e)}")

    # Nothing worked: return actionable installation hints instead
    suggestion = """XLS文件处理建议:
1. 推荐安装: pip install pandas (自动处理新旧Excel格式)
2. 专门方案: pip install xlrd (旧版Excel专用)
3. Windows用户: pip install pywin32 (需要Excel)"""

    return suggestion, "处理建议"

def extract_pptx_content(file_path: str, max_chars: int = 5000, api_key: str = '', moonshot_key: str = '') -> tuple[str, str]:
    """Extract text from a PPTX presentation.

    Backends are tried in order and the first one that yields content wins:

    0. Moonshot API (only when ``moonshot_key`` or ``api_key`` is given; the
       result is accepted only if it has more than 50 non-blank characters)
    1. python-pptx
    2. PowerPoint COM automation through win32com (only when ``HAS_WIN32COM``)

    The local backends read at most the first 20 slides.

    Args:
        file_path: Path of the presentation to read.
        max_chars: Maximum number of characters of extracted text to return.
        api_key: General API key, used for file parsing when ``moonshot_key``
            is empty.
        moonshot_key: Key dedicated to Moonshot file parsing; takes precedence
            over ``api_key``.

    Returns:
        tuple: ``(extracted_text, method_used)``. When no backend produced any
        content, the text is an installation hint and the method is
        ``"处理建议"``.
    """
    # Decide which key is used for file parsing
    file_parse_key = moonshot_key if moonshot_key else api_key

    # Method 0: Moonshot API, when a key is configured
    if file_parse_key:
        try:
            logger.info("🌙 尝试使用 Moonshot API 识别 PPTX 文件")
            moonshot_text = recognize_image_with_moonshot(file_path, file_parse_key, max_chars)

            if moonshot_text and len(moonshot_text.strip()) > 50:
                logger.info(f"✅ Moonshot API 成功识别 PPTX 文件，内容长度: {len(moonshot_text)} 字符")
                return moonshot_text, "Moonshot API"
            else:
                logger.info("⚠️  Moonshot API 未能有效识别 PPTX 内容，尝试其他方法")
        except Exception as e:
            logger.debug(f"Moonshot API 处理 PPTX 失败: {str(e)}")

    # Method 1: python-pptx (preferred local backend)
    try:
        presentation = pptx.Presentation(file_path)
        all_content = []

        # Cap the number of slides that are read
        total_slides = len(presentation.slides)
        max_slides = min(20, total_slides)

        logger.info(f"📊 PPTX 共 {total_slides} 页，准备提取前 {max_slides} 页")

        for i, slide in enumerate(presentation.slides):
            if i >= max_slides:
                break

            slide_content = f"幻灯片 {i+1}:\n"
            slide_texts = []

            for shape in slide.shapes:
                if hasattr(shape, "text") and shape.text.strip():
                    slide_texts.append(shape.text.strip())

                # Graphic frames expose ``.table`` as a property that raises
                # ValueError when the frame holds a chart or other non-table
                # object. hasattr() only swallows AttributeError, so probing
                # ``.table`` would abort the whole extraction; ``has_table``
                # is the safe check.
                if getattr(shape, "has_table", False):
                    table_content = []
                    for row in shape.table.rows:
                        row_text = [cell.text.strip() for cell in row.cells if cell.text.strip()]
                        if row_text:
                            table_content.append(" | ".join(row_text))
                    if table_content:
                        slide_texts.extend(table_content)

            if slide_texts:
                slide_content += "\n".join(slide_texts)
                all_content.append(slide_content)

        if all_content:
            text = "\n\n".join(all_content)[:max_chars]
            logger.info(f"✅ python-pptx 成功提取 PPTX 内容，共 {len(all_content)} 页")
            return text, "python-pptx"
    except Exception as e:
        logger.debug(f"python-pptx处理失败: {str(e)}")

    # Method 2: win32com (drives a locally installed PowerPoint on Windows)
    if HAS_WIN32COM:
        ppt = None
        presentation = None
        com_initialized = False
        try:
            import pythoncom
            pythoncom.CoInitialize()
            com_initialized = True

            # Visible is deliberately left untouched; the window is suppressed
            # through WithWindow=False instead.
            ppt = win32com.client.Dispatch("PowerPoint.Application")
            presentation = ppt.Presentations.Open(
                # PowerPoint resolves relative paths against its own working
                # directory, so always hand it an absolute path.
                os.path.abspath(file_path),
                ReadOnly=True,
                Untitled=True,
                WithWindow=False
            )

            all_content = []
            max_slides = min(20, presentation.Slides.Count)

            # COM collections are 1-based
            for i in range(1, max_slides + 1):
                try:
                    slide = presentation.Slides(i)
                    slide_content = f"幻灯片 {i}:\n"
                    slide_texts = []

                    for shape in slide.Shapes:
                        if hasattr(shape, 'TextFrame') and shape.TextFrame.HasText:
                            text = shape.TextFrame.TextRange.Text.strip()
                            if text:
                                slide_texts.append(text)

                    if slide_texts:
                        slide_content += "\n".join(slide_texts)
                        all_content.append(slide_content)
                except Exception as e:
                    logger.debug(f"提取幻灯片 {i} 失败: {str(e)}")
                    continue

            if all_content:
                text = "\n\n".join(all_content)[:max_chars]
                logger.info(f"✅ win32com 成功提取 PPTX 内容，共 {len(all_content)} 页")
                return text, "win32com"
        except Exception as e:
            logger.debug(f"win32com处理PPTX失败: {str(e)}")
        finally:
            # Always release PowerPoint. Each step is isolated so that a
            # failing Close() cannot prevent Quit() or the COM teardown.
            if presentation is not None:
                try:
                    presentation.Close()
                except Exception as e:
                    logger.debug(f"win32com关闭PPTX演示文稿失败: {str(e)}")
            if ppt is not None:
                try:
                    ppt.Quit()
                except Exception as e:
                    logger.debug(f"win32com退出PowerPoint失败: {str(e)}")
            if com_initialized:
                try:
                    pythoncom.CoUninitialize()
                except Exception as e:
                    logger.debug(f"win32com释放COM失败: {str(e)}")

    # Nothing worked: return actionable installation hints instead
    suggestion = """PPTX文件处理建议:
1. 推荐安装: pip install python-pptx (标准PPTX处理库)
2. Windows用户: pip install pywin32 (需要PowerPoint)
3. 或将PPTX内容复制到文本文件中"""

    return suggestion, "处理建议"

def extract_ppt_content(file_path: str, max_chars: int = 5000, api_key: str = '', moonshot_key: str = '') -> tuple[str, str]:
    """Extract text from a legacy PowerPoint (.ppt) presentation.

    Backends are tried in order and the first one that yields content wins:

    1. Moonshot API (only when ``moonshot_key`` or ``api_key`` is given; the
       result is accepted only if it has more than 50 non-blank characters)
    2. PowerPoint COM automation through win32com (only when ``HAS_WIN32COM``),
       limited to the first 20 slides
    3. A heuristic scan of the raw file bytes for readable text runs

    Args:
        file_path: Path of the presentation to read.
        max_chars: Maximum number of characters of extracted text to return.
        api_key: General API key, used for file parsing when ``moonshot_key``
            is empty.
        moonshot_key: Key dedicated to Moonshot file parsing; takes precedence
            over ``api_key``.

    Returns:
        tuple: ``(extracted_text, method_used)``. When every backend fails,
        the text is a troubleshooting guide and the method is ``"处理建议"``.
    """
    # Decide which key is used for file parsing
    file_parse_key = moonshot_key if moonshot_key else api_key

    # Method 1: Moonshot API (needs no local PowerPoint)
    if file_parse_key:
        try:
            logger.info("🌙 尝试使用 Moonshot API 识别 PPT 文件")
            moonshot_text = recognize_image_with_moonshot(file_path, file_parse_key, max_chars)

            if moonshot_text and len(moonshot_text.strip()) > 50:
                logger.info(f"✅ Moonshot API 成功识别 PPT 文件，内容长度: {len(moonshot_text)} 字符")
                return moonshot_text, "Moonshot API"
            else:
                logger.info("⚠️  Moonshot API 未能有效识别 PPT 内容")
        except Exception as e:
            logger.debug(f"Moonshot API 处理 PPT 失败: {str(e)}")

    # Method 2: win32com (drives a locally installed PowerPoint on Windows)
    if HAS_WIN32COM:
        ppt = None
        presentation = None
        com_initialized = False
        try:
            import pythoncom
            pythoncom.CoInitialize()
            com_initialized = True

            # Visible is deliberately left at its default: some PowerPoint
            # versions refuse Visible = False. The window is suppressed through
            # WithWindow=False instead.
            ppt = win32com.client.Dispatch("PowerPoint.Application")
            presentation = ppt.Presentations.Open(
                # PowerPoint resolves relative paths against its own working
                # directory, so always hand it an absolute path.
                os.path.abspath(file_path),
                ReadOnly=True,
                Untitled=True,
                WithWindow=False  # do not show a window
            )

            all_content = []
            total_slides = presentation.Slides.Count
            max_slides = min(20, total_slides)

            logger.info(f"📊 PPT 共 {total_slides} 页，准备提取前 {max_slides} 页")

            # COM collections are 1-based
            for i in range(1, max_slides + 1):
                try:
                    slide = presentation.Slides(i)
                    slide_content = f"幻灯片 {i}:\n"
                    slide_texts = []

                    # Collect the text of every shape that carries any
                    for shape in slide.Shapes:
                        if hasattr(shape, 'TextFrame') and shape.TextFrame.HasText:
                            text = shape.TextFrame.TextRange.Text.strip()
                            if text:
                                slide_texts.append(text)

                    if slide_texts:
                        slide_content += "\n".join(slide_texts)
                        all_content.append(slide_content)
                except Exception as e:
                    logger.debug(f"提取幻灯片 {i} 失败: {str(e)}")
                    continue

            if all_content:
                text = "\n\n".join(all_content)[:max_chars]
                logger.info(f"✅ win32com 成功提取 PPT 内容，共 {len(all_content)} 页")
                return text, "win32com"

            logger.warning("⚠️  win32com 未能提取到有效内容")

        except Exception as e:
            error_msg = str(e)
            logger.info(f"⚠️  win32com 处理 PPT 失败: {error_msg}")
        finally:
            # Always release PowerPoint. Each step is isolated so that a
            # failing Close() cannot prevent Quit() or the COM teardown.
            if presentation is not None:
                try:
                    presentation.Close()
                except Exception as e:
                    logger.debug(f"win32com关闭PPT演示文稿失败: {str(e)}")
            if ppt is not None:
                try:
                    ppt.Quit()
                except Exception as e:
                    logger.debug(f"win32com退出PowerPoint失败: {str(e)}")
            if com_initialized:
                try:
                    pythoncom.CoUninitialize()
                except Exception as e:
                    logger.debug(f"win32com释放COM失败: {str(e)}")

    # Method 3: heuristic text scan of the raw bytes (last-resort fallback)
    logger.info("📝 尝试使用文本提取方法识别 PPT 内容")
    try:
        with open(file_path, 'rb') as f:
            raw_data = f.read()

        # A run starts with a Latin letter or CJK ideograph and continues for
        # at least 15 more characters from the allowed set. Apostrophes are
        # not part of that set; this is the pattern the fallback has always
        # effectively used, written here as a single literal.
        fragment_pattern = re.compile(
            r'[a-zA-Z\u4e00-\u9fff][a-zA-Z0-9\u4e00-\u9fff\s\.,;!?：；。，、？！"（）()]{15,}'
        )

        # Try several encodings; the first one that yields enough text wins
        for encoding in ['utf-16le', 'utf-8', 'gbk', 'latin1']:
            try:
                decoded = raw_data.decode(encoding, errors='ignore')
                text_matches = fragment_pattern.findall(decoded)

                # Require more than 3 runs before treating the decode as meaningful
                if text_matches and len(text_matches) > 3:
                    cleaned_texts = []
                    for match in text_matches[:30]:
                        # Collapse runs of whitespace
                        cleaned = ' '.join(match.split())
                        if len(cleaned) > 20:  # keep only fragments with some substance
                            cleaned_texts.append(cleaned)

                    if cleaned_texts:
                        text = '\n\n'.join(cleaned_texts)
                        if len(text.strip()) > 100:
                            logger.info(f"✅ 文本提取成功，提取了 {len(cleaned_texts)} 个文本片段")
                            return text[:max_chars], f"文本提取({encoding})"
            except Exception:
                # Best effort: a failing encoding just moves on to the next one
                continue

    except Exception as e:
        logger.debug(f"PPT文本提取失败: {str(e)}")

    # Method 4: every backend failed, so return a troubleshooting guide
    logger.warning("⚠️  所有 PPT 识别方法都失败了")

    try:
        file_size = os.path.getsize(file_path) / 1024  # KB
    except OSError:
        # The file may be missing or unreadable (which is often why we got
        # here); still return the guide instead of raising.
        file_size = 0.0
    file_name = os.path.basename(file_path)

    suggestion = f"""【PPT 文件识别失败】

文件: {file_name}
大小: {file_size:.1f} KB

【推荐解决方案】

🌟 最佳方案（推荐）：
1. 配置 Moonshot API - 无需 PowerPoint，自动识别 PPT 内容
   - 在设置中填写 Moonshot API 密钥
   - 支持所有版本的 PPT/PPTX 文件
   - 识别准确率最高

🔧 替代方案：
1. 安装 Microsoft PowerPoint（Office 2007 或更高版本）
   - 然后运行: pip install pywin32
   
2. 将 PPT 转换为 PPTX 格式
   - 使用 PowerPoint 另存为 .pptx 格式
   - 或使用在线转换工具
   
3. 将内容导出为文本
   - 在 PowerPoint 中复制所有内容到文本文件
   
4. 使用多模态 AI
   - 在个人设置中配置多模态 AI 服务
   - 可以识别 PPT 的截图和内容

【说明】
此 PPT 文件可能是 Office 2003 格式（.ppt），处理较为困难。
强烈推荐使用 Moonshot API 或转换为 PPTX 格式。"""

    return suggestion, "处理建议"

# LibreOffice 转换函数已移除，专注于纯Python解决方案

def get_file_metadata(file_path: str) -> str:
    """Return a short, human-readable summary of a file's basic metadata.

    The summary lists the file name, its size in KB, and its creation and
    modification times formatted with ``time.ctime``.

    Args:
        file_path: Path of the file to describe.

    Returns:
        The multi-line summary, or just ``"文件: <name>"`` when the file's
        metadata cannot be read.
    """
    try:
        file_stat = os.stat(file_path)
        size_kb = file_stat.st_size / 1024

        # st_ctime is the creation time only on Windows; on Unix it is the
        # time of the last metadata change. Prefer st_birthtime where the
        # platform provides it and fall back to st_ctime otherwise.
        created_ts = getattr(file_stat, "st_birthtime", None)
        if created_ts is None:
            created_ts = file_stat.st_ctime

        created_time = time.ctime(created_ts)
        modified_time = time.ctime(file_stat.st_mtime)

        return f"""文件基本信息:
文件名: {Path(file_path).name}
大小: {size_kb:.2f} KB
创建时间: {created_time}
修改时间: {modified_time}"""
    except Exception:
        # Best effort: fall back to the bare file name
        return f"文件: {Path(file_path).name}"

def diagnose_unstructured_issues() -> str:
    """Probe the optional image-recognition dependencies and build a report.

    Each dependency is checked by attempting to import it. Successful checks
    are logged; failed checks add an entry to the list of problems and,
    where one is known, a matching install hint.

    Returns:
        A text report with the problems found and the suggested fixes, or a
        line stating that every check passed.
    """
    problems = []
    remedies = []

    # Core package: a failure while reading the version is reported as well
    try:
        import unstructured
        logger.info(f"✓ unstructured 已安装，版本: {unstructured.__version__}")
    except ImportError:
        problems.append("❌ unstructured 未安装")
        remedies.append("安装: pip install unstructured[all-docs]")
    except Exception as exc:
        problems.append(f"❌ unstructured 导入错误: {str(exc)}")

    # The loader itself: pick the hint from what the import error mentions
    try:
        from langchain_community.document_loaders import UnstructuredImageLoader
    except ImportError as exc:
        problems.append("❌ UnstructuredImageLoader 导入失败")
        lowered = str(exc).lower()
        if "pdfminer" in lowered or "open_filename" in lowered:
            hint = "pdfminer版本冲突: pip install pdfminer.six==20211012"
        elif "pi_heif" in lowered:
            hint = "图片格式支持: pip install pillow-heif"
        else:
            hint = "langchain_community依赖: pip install langchain-community --upgrade"
        remedies.append(hint)
    else:
        logger.info("✓ UnstructuredImageLoader 可用")

    # Image format support
    try:
        import pillow_heif
    except ImportError:
        problems.append("⚠️ pillow-heif 未安装，无法处理HEIF/HEIC格式")
        remedies.append("HEIF支持: pip install pillow-heif")
    else:
        logger.info("✓ pillow-heif 已安装，支持HEIF/HEIC格式")

    try:
        import pi_heif
    except ImportError:
        problems.append("⚠️ pi-heif 未安装，某些HEIF功能可能受限")
        remedies.append("HEIF支持: pip install pi-heif")
    else:
        logger.info("✓ pi-heif 已安装")

    # OCR support
    try:
        import pytesseract
    except ImportError:
        problems.append("⚠️ pytesseract 未安装，缺少备用OCR支持")
        remedies.append("OCR支持: pip install pytesseract")
    else:
        logger.info("✓ pytesseract 已安装，可作为备用OCR")

    # Assemble the report from its sections
    sections = ["=== UnstructuredImageLoader 诊断报告 ===\n"]
    if problems:
        sections.append("\n发现的问题:\n" + "\n".join(problems))
    if remedies:
        sections.append("\n\n建议的解决方案:\n" + "\n".join(remedies))
    if not problems:
        sections.append("\n✓ 所有依赖检查通过")

    return "".join(sections)



def test_unstructured_availability() -> bool:
    """Report whether UnstructuredImageLoader can actually be imported.

    Returns:
        ``True`` when ``HAS_UNSTRUCTURED`` is set and the loader imports
        cleanly; ``False`` otherwise. On an unexpected failure the dependency
        diagnosis is run and written to the log before returning ``False``.
    """
    try:
        if HAS_UNSTRUCTURED:
            # Only the import is exercised: no loader instance is created and
            # no real file is touched.
            from langchain_community.document_loaders import UnstructuredImageLoader
            return True
        return False
    except Exception as exc:
        logger.warning(f"UnstructuredImageLoader 测试失败: {str(exc)}")
        # Run the diagnosis and record its outcome
        report = diagnose_unstructured_issues()
        logger.info(f"依赖诊断结果:\n{report}")
        return False


def recognize_image_with_unstructured(image_path: str, max_chars: int = 5000) -> str:
    """Recognise the content of an image with langchain's UnstructuredImageLoader.

    Falls back to ``recognize_image_with_basic_ocr`` when the loader is not
    installed, is not usable, or raises while processing the image.

    Args:
        image_path: Path of the image to process.
        max_chars: Maximum number of characters of recognised text to return.

    Returns:
        The recognised text (truncated to ``max_chars`` on the loader path),
        or an empty string when the file does not exist, nothing was
        recognised, or every method failed.
    """
    try:
        if not HAS_UNSTRUCTURED:
            logger.warning("UnstructuredImageLoader未安装或导入失败，尝试使用备用方法")
            return recognize_image_with_basic_ocr(image_path, max_chars)

        # Confirm that the loader really is usable before relying on it
        if not test_unstructured_availability():
            logger.warning("UnstructuredImageLoader 不可用，尝试使用备用方法")
            return recognize_image_with_basic_ocr(image_path, max_chars)

        logger.info("使用 Unstructured处理图片")

        if not os.path.exists(image_path):
            logger.error(f"图片文件不存在: {image_path}")
            return ""

        loader = UnstructuredImageLoader(image_path)
        documents = loader.load()

        if not documents:
            logger.info("UnstructuredImageLoader未返回任何文档")
            return ""

        # Only the first returned document is used
        content = documents[0].page_content
        if content and content.strip():
            logger.info(f"UnstructuredImageLoader识别成功，内容长度: {len(content)} 字符")
            return content[:max_chars]

        logger.info("UnstructuredImageLoader未提取到有效内容")
        return ""

    except ImportError as import_error:
        logger.error(f"UnstructuredImageLoader导入错误: {str(import_error)}")
        return ""
    except Exception as e:
        # Record why the primary path failed before falling back; without
        # this the real cause never reaches the log.
        logger.warning(f"UnstructuredImageLoader处理图片失败: {str(e)}")
        logger.info("尝试使用备用OCR方法...")
        try:
            return recognize_image_with_basic_ocr(image_path, max_chars)
        except Exception as backup_error:
            logger.warning(f"备用OCR方法也失败: {str(backup_error)}")
            return ""


def load_document(file_path: str, max_chars: int = 5000, api_key: str = '', is_local: bool = False,
                  image_recognition_method: ImageRecognitionMethod = "tesseract", online_model: str = None, 
                  api_base: str = None, model_name: str = None, moonshot_key: Optional[str] = None):
    """根据文件扩展名加载文档的前N个字符"""
    file_extension = Path(file_path).suffix.lower()
    file_name = Path(file_path).stem
    
    # 🚀 新增：决定使用哪个 API key 进行文件解析
    # moonshot_key 专门用于 Moonshot API 的文件上传和解析（PDF、图片、Office等）
    # api_key 用于文本分析和分类
    file_parse_key = moonshot_key if moonshot_key else api_key
    logger.info(f"🔑 文件解析使用的 API key: {'moonshot_key' if moonshot_key else 'api_key'}")
    
    # 基本文件检查
    if not os.path.exists(file_path):
        logger.error(f"文件不存在: {file_path}")
        return []
    
    # 检查文件是否可读
    try:
        with open(file_path, 'rb') as test_file:
            test_file.read(1)  # 尝试读取1字节
    except PermissionError:
        logger.error(f"没有权限读取文件: {file_path}")
        return []
    except Exception as e:
        logger.error(f"文件读取测试失败: {file_path}, 错误: {str(e)}")
        return []
    
    # 🚀 优化：Office 文件优先使用 Moonshot API 识别（无论本地还是线上模式）
    # 获取完整内容，但分类时只使用前 2500 字符，节省 token
    office_extensions = ['.ppt', '.pptx', '.doc', '.docx', '.xls', '.xlsx']
    if file_parse_key and file_extension in office_extensions:
        try:
            logger.info(f"🌙 检测到 Office 文件 {file_extension}，优先使用 Moonshot API 识别")
            logger.info(f"📋 参数检查 - is_local: {is_local}, file_parse_key: {'已配置' if file_parse_key else '未配置'}")
            
            # 使用 Moonshot API 获取完整文件内容
            full_text = recognize_image_with_moonshot(file_path, file_parse_key, max_chars=10000)  # 获取更多内容
            
            if full_text and len(full_text.strip()) > 50:
                # 记录完整内容长度
                full_length = len(full_text)
                logger.info(f"✅ Moonshot API 成功识别 {file_extension} 文件，完整内容长度: {full_length} 字符")
                
                # 截取前 2500 字符用于分类，节省 token
                classification_text = full_text[:2500]
                actual_length = len(classification_text)
                
                if full_length > 2500:
                    logger.info(f"📊 为节省 token，分类时使用前 {actual_length} 字符（完整内容: {full_length} 字符）")
                    classification_text += f"\n\n[注：完整文档共 {full_length} 字符，此处仅显示前 {actual_length} 字符用于分类]"
                
                # 记录识别内容摘要
                logger.info(f"【Moonshot API 识别内容】 {file_name}{file_extension}（前200字）:\n{classification_text[:200]}...")
                
                return [Document(page_content=f"文件名：{file_name}\n\n{classification_text}")]
            else:
                logger.warning(f"⚠️  Moonshot API 返回内容为空或过短，将使用传统方法")
                
        except Exception as e:
            logger.error(f"⚠️  Moonshot API 识别 Office 文件失败: {str(e)}，将使用传统方法")
            import traceback
            logger.error(f"详细错误：{traceback.format_exc()}")
    else:
        # 如果是 Office 文件但没有 API key，给出提示
        if file_extension in office_extensions:
            logger.warning(f"⚠️  Office 文件 {file_extension} 未配置 Moonshot API，将使用传统方法（效果可能不佳）")
    
    try:
        if file_extension in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp']:
            logger.info(f"开始处理图片文件: {file_path}")
            
            # 🚀 优化：使用OCR优化器按优先级识别图片
            try:
                from lib.ocr_optimizer import ocr_optimizer
                text, recognition_method = ocr_optimizer.recognize_image_with_priority(
                    image_path=file_path,
                    max_chars=max_chars,
                    api_key=file_parse_key,  # 🚀 使用 file_parse_key 用于文件解析
                    is_local=is_local,
                    image_recognition_method=image_recognition_method,
                    online_model=online_model,
                    api_base=api_base,
                    model_name=model_name
                )
            except ImportError:
                # 如果OCR优化器不可用，使用原有逻辑
                text = ""
                recognition_method = "未知"
            
            # 🚀 优化：按照用户要求的优先级顺序进行图片识别
            # 1. 首先使用用户配置的Tesseract OCR
            if image_recognition_method in ["tesseract", "both"]:
                try:
                    # 导入多模态AI服务
                    from lib.multimodal_ai import multimodal_service
                    
                    # 使用智能多模态AI识别
                    success, content, cost, provider = multimodal_service.recognize_image_smart(
                        image_path=file_path,
                        prompt="""请详细分析这张图片的内容，包括：
1. 图片中的文字内容（如果有）
2. 主要物体和场景
3. 图片的用途或类型
4. 任何重要的细节信息

请用中文回答，内容要准确且有条理。"""
                    )
                    
                    if success and content:
                        text = content
                        recognition_method = f"多模态AI ({provider})"
                        logger.info(f"多模态AI识别成功，服务商: {provider}，成本: ${cost:.4f}")
                        logger.info(f"识别结果：\n{text}")
                    else:
                        logger.info(f"多模态AI识别失败: {content}")
                        
                        # 降级到传统方法
                        try:
                            logger.info("降级到Moonshot API")
                            moonshot_text = recognize_image_with_moonshot(file_path, file_parse_key, max_chars)
                            if moonshot_text:
                                text = moonshot_text
                                recognition_method = "Moonshot API (降级)"
                                logger.info(f"Moonshot API识别结果：\n{text}")
                        except Exception as moonshot_e:
                            logger.info(f"Moonshot API也失败: {str(moonshot_e)}")
                            
                            # 最后尝试Unstructured
                            try:
                                logger.info("尝试使用Unstructured作为最后备用")
                                unstructured_text = recognize_image_with_unstructured(file_path, max_chars)
                                if unstructured_text:
                                    text = unstructured_text
                                    recognition_method = "Unstructured (最后备用)"
                                    logger.info(f"Unstructured识别结果：\n{text}")
                            except Exception as unstructured_error:
                                logger.info(f"Unstructured也失败: {str(unstructured_error)}")
                        
                except Exception as multimodal_error:
                    logger.error(f"多模态AI服务调用失败: {str(multimodal_error)}")
                    
                    # 降级到原有方法
                    try:
                        # 使用Moonshot API处理图片
                        moonshot_text = recognize_image_with_moonshot(file_path, file_parse_key, max_chars)
                        if moonshot_text:
                            text = moonshot_text
                            recognition_method = "Moonshot API"
                            logger.info(f"Moonshot API识别结果：\n{text}")
                    except Exception as e:
                        logger.info(f"使用Moonshot API处理图片失败: {str(e)}")
                        
                        # 如果Moonshot API失败，尝试使用langchain的unstructured image方法
                        try:
                            logger.info("Moonshot API失败，尝试使用Unstructured")
                            unstructured_text = recognize_image_with_unstructured(file_path, max_chars)
                            if unstructured_text:
                                text = unstructured_text
                                recognition_method = "Unstructured (备用)"
                                logger.info(f"Unstructured识别结果：\n{text}")
                        except Exception as unstructured_error:
                            logger.info(f"使用Unstructured处理图片也失败: {str(unstructured_error)}")
            
            # 如果Moonshot API和UnstructuredImageLoader都失败或未配置，回退到传统方法
            if not text:
                # 优先尝试UnstructuredImageLoader（如果可用）
                if HAS_UNSTRUCTURED and test_unstructured_availability():
                    try:
                        logger.info("尝试使用Unstructured处理图片")
                        unstructured_text = recognize_image_with_unstructured(file_path, max_chars)
                        if unstructured_text:
                            text = unstructured_text
                            recognition_method = "Unstructured"
                            logger.info(f"Unstructured识别结果：\n{text}")
                    except Exception as e:
                        logger.info(f"使用Unstructured处理图片失败: {str(e)}")
                
                # 如果UnstructuredImageLoader失败，根据指定的方法进行图片识别
                if not text:
                    if image_recognition_method in ["tesseract"]:
                        try:
                            logger.info("使用pytesseract处理图片")
                            # 使用pytesseract处理图片
                            img = Image.open(file_path)
                            tesseract_text = pytesseract.image_to_string(img, lang='chi_sim+eng')
                            text = tesseract_text
                            recognition_method = "Tesseract OCR"
                            logger.info(f"pytesseract识别结果：\n{text}")
                        except Exception as e:
                            logger.info(f"使用pytesseract处理图片失败: {str(e)}")

                    if image_recognition_method in ["llava"]:
                        try:
                            # 使用LLaVA模型处理图片
                            llava_text = recognize_image_with_llava(file_path, max_chars)
                            if llava_text:
                                if text:
                                    text = f"{text}\n\nLLaVA识别结果：\n{llava_text}"
                                    recognition_method += " + LLaVA"
                                else:
                                    text = llava_text
                                    recognition_method = "LLaVA"
                        except Exception as e:
                            logger.info(f"使用LLaVA模型处理图片失败: {str(e)}")

            # 如果两种方法都失败，提取基本图片信息
            if not text:
                try:
                    img = Image.open(file_path)
                    # 提取基本图片信息
                    width, height = img.size
                    format_type = img.format
                    mode = img.mode
                    
                    # 构建基本图片描述
                    img_info = f"""图片信息:
- 文件名: {file_name}{file_extension}
- 尺寸: {width}x{height}像素
- 格式: {format_type}
- 颜色模式: {mode}"""
                    
                    text = img_info
                    recognition_method = "基本图片信息提取"
                except Exception as img_info_error:
                    logger.error(f"提取图片基本信息也失败: {str(img_info_error)}")
                    # 使用最基本的描述
                    text = f"这是一个图片文件: {file_name}{file_extension}"
                    recognition_method = "默认描述"

            # 限制文本长度
            text = text[:max_chars]
            
            # 添加详细的识别日志
            logger.info(f"图片文件识别完成 - 文件路径: {file_path}, 识别方式: {recognition_method}, 识别内容: {text[:100]}{'...' if len(text) > 100 else ''}")
            
            return [Document(page_content=f"文件名：{file_name}\n\n{text}")]
        
        # 🎬 视频文件处理
        elif file_extension in ['.mp4', '.avi', '.mov', '.mkv', '.flv', '.wmv', '.webm', '.m4v', '.mpg', '.mpeg']:
            logger.info(f"开始处理视频文件: {file_path}")
            
            try:
                from lib.video_processor import video_analyzer, ffmpeg_manager
                
                # 检查ffmpeg是否安装
                if not ffmpeg_manager.is_installed():
                    logger.warning("ffmpeg未安装，视频分类功能不可用。请在设置中下载安装ffmpeg")
                    text = f"视频文件: {file_name}{file_extension}\n(需要安装ffmpeg才能进行内容分析)\n\n请在个人设置中点击'视频处理插件'按钮下载安装ffmpeg"
                    return [Document(page_content=f"文件名：{file_name}\n\n{text}")]
                
                # 分析视频（提取最多5个关键帧以获得更全面的内容理解）
                logger.info(f"使用ffmpeg提取视频关键帧...")
                analysis_result = video_analyzer.analyze_video(file_path, num_frames=5)
                
                # 获取分析摘要
                text = analysis_result['summary']
                
                # 如果有帧分析结果，添加更多细节（包含所有分析的关键帧）
                if analysis_result.get('frame_analyses'):
                    text += "\n\n【关键帧详细分析】"
                    for i, frame_analysis in enumerate(analysis_result['frame_analyses']):
                        # 包含完整的关键帧分析内容，帮助AI更好地理解视频
                        text += f"\n关键帧{i+1}: {frame_analysis['content']}"
                
                # 限制文本长度（为视频内容预留更多空间）
                text = text[:max_chars * 2]  # 视频文件允许更长的描述
                
                logger.info(f"视频文件分析完成 - 文件路径: {file_path}, 识别方式: FFmpeg+多模态AI, 识别内容: {text[:100]}{'...' if len(text) > 100 else ''}")
                
                return [Document(page_content=f"文件名：{file_name}\n\n{text}")]
                
            except Exception as e:
                logger.error(f"视频内容提取失败: {str(e)}")
                
                # 降级到智能文件名分析
                try:
                    file_stat = os.stat(file_path)
                    file_size = file_stat.st_size
                    
                    # 分析文件名
                    import re
                    name_features = []
                    
                    # 检测日期
                    if re.search(r'\d{1,2}月\d{1,2}日', file_name):
                        name_features.append('包含日期信息（可能是按日期记录的个人视频）')
                    
                    # 检测关键词
                    keywords_map = {
                        '会议': '工作会议', '项目': '工作项目', '演示': '演示展示',
                        '教程': '教育培训', '课程': '教育培训', '学习': '教育培训',
                        '旅游': '旅行记录', '旅行': '旅行记录', 'vlog': '个人生活',
                        '生活': '个人生活', '聚会': '个人生活'
                    }
                    
                    detected_category = None
                    for keyword, category in keywords_map.items():
                        if keyword.lower() in file_name.lower():
                            detected_category = category
                            name_features.append(f'检测到关键词"{keyword}"，建议分类为：{category}')
                            break
                    
                    # 构建智能降级描述
                    text = f"""【视频文件智能分类】

文件名: {file_name}{file_extension}
文件大小: {file_size / (1024*1024):.1f} MB

【分析说明】
由于视频内容提取失败（错误: {str(e)}），系统无法识别视频画面内容。
但系统已基于文件名和元数据进行了智能分析：

"""
                    if name_features:
                        text += "文件名特征：\n- " + "\n- ".join(name_features) + "\n\n"
                    
                    if detected_category:
                        text += f"【推荐分类】{detected_category}\n\n"
                    else:
                        # 没有检测到关键词时的默认建议
                        if file_size < 10 * 1024 * 1024:  # < 10MB
                            text += "【推荐分类】个人记录/短视频（基于文件较小）\n\n"
                        else:
                            text += "【推荐分类】个人记录/生活片段（基于无明显特征）\n\n"
                    
                    text += """【重要提示】
请不要将此视频归类为"未分类项目"或"视频文件"等模糊分类。
建议根据上述推荐分类或文件名含义创建具体的分类。"""
                    
                    return [Document(page_content=f"文件名：{file_name}\n\n{text}")]
                    
                except Exception as fallback_error:
                    logger.error(f"视频降级分析也失败: {str(fallback_error)}")
                    text = f"""【视频文件】
文件名: {file_name}{file_extension}

由于技术原因无法分析视频内容，但请不要将其归类为"未分类项目"。
建议根据文件名创建合适的分类，如"个人记录"、"生活片段"等。"""
                return [Document(page_content=f"文件名：{file_name}\n\n{text}")]
        
        elif file_extension in ['.txt', '.md', '.rtf', '.log', '.ini', '.csv', '.json', '.xml', '.yaml', '.yml']:
            # 使用内置open函数处理文本文件
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read(max_chars)
            
            # 添加详细的识别日志
            logger.info(f"文本文件识别完成 - 文件路径: {file_path}, 识别方式: 内置open函数, 识别内容: {content[:100]}{'...' if len(content) > 100 else ''}")
            
            return [Document(page_content=f"文件名：{file_name}\n\n{content}")]
        elif file_extension == '.pdf':
            # 🚀 新增：优先使用Moonshot API处理扫描型PDF和图片型PDF
            text = ""
            pdf_extracted_successfully = False
            recognition_method = "PyPDF2"
            
            # 🚀 调试日志
            logger.info(f"🔍 PDF处理条件检查 - is_local: {is_local}, api_key: {'有' if api_key else '无'}")
            logger.info(f"🔍 online_model: {online_model}, model_name: {model_name}")
            logger.info(f"🔍 api_base: {api_base}")
            
            # 检查是否为非本地模型且有API密钥，优先尝试Moonshot处理
            # 支持deepseek/kimi配置或moonshot模型
            moonshot_conditions = (
                online_model in ['deepseek', 'kimi'] or 
                (model_name and 'moonshot' in model_name.lower()) or
                (api_base and 'moonshot' in api_base.lower())
            )
            
            logger.info(f"🔍 moonshot_conditions: {moonshot_conditions}")
            
            if not is_local and file_parse_key and moonshot_conditions:
                try:
                    from lib.pdf_moonshot_processor import get_pdf_moonshot_processor
                    processor = get_pdf_moonshot_processor(file_parse_key)  # 🚀 使用 file_parse_key 用于文件解析
                    
                    if processor.is_available():
                        logger.info(f"🌙 检测到非本地模型({online_model})，优先使用Moonshot处理PDF: {file_path}")
                        
                        # 🚀 对所有PDF都使用Moonshot处理（前10页临时PDF方式）
                        logger.info("使用Moonshot API处理PDF（前10页临时PDF方式）")
                        
                        # 使用Moonshot处理PDF
                        moonshot_result = processor.process_pdf_with_moonshot(
                            file_path=file_path,
                            custom_prompt="请详细提取并总结这个PDF文档的所有文字内容，包括中文和英文。",
                            max_pages=10
                        )
                        
                        if moonshot_result:
                            text = moonshot_result[:max_chars]
                            pdf_extracted_successfully = True
                            recognition_method = "Moonshot API (前10页临时PDF)"
                            logger.info(f"🎉 Moonshot API处理PDF成功，内容长度: {len(text)} 字符")
                            
                            return [Document(page_content=f"文件名：{file_name}\n\n{text}")]
                        else:
                            logger.warning("⚠️ Moonshot API处理PDF失败，回退到传统方法")
                    else:
                        logger.info("Moonshot处理器不可用，使用传统PDF处理方法")
                
                except ImportError:
                    logger.info("PDF Moonshot处理器模块未找到，使用传统PDF处理方法")
                except Exception as e:
                    logger.warning(f"Moonshot处理器初始化失败: {str(e)}，回退到传统方法")
            
            # 传统PDF处理方法（PyPDF2）
            
            try:
                reader = PyPDF2.PdfReader(file_path)
                # 读取前几页文本
                for i in range(min(3, len(reader.pages))):
                    page_text = reader.pages[i].extract_text()
                    if page_text:
                        text += page_text + "\n\n"
                
                # 检查是否提取到了有效文本（去除空白字符后长度大于50表示有效）
                if text.strip() and len(text.strip()) > 50:
                    pdf_extracted_successfully = True
                    text = text[:max_chars]
                    
                    # 添加详细的识别日志
                    logger.info(f"PDF文件识别完成 - 文件路径: {file_path}, 识别方式: PyPDF2, 识别内容: {text[:100]}{'...' if len(text) > 100 else ''}")
                    
                    return [Document(page_content=f"文件名：{file_name}\n\n{text}")]
                else:
                    logger.info(f"PyPDF2提取的文本内容过少，可能是图片型PDF，尝试使用OCR方法")
                    
            except Exception as e:
                logger.error(f"使用PyPDF2处理PDF文件失败: {str(e)}")
            
            # 如果PyPDF2提取失败或提取内容过少，尝试使用PyMuPDF进行OCR处理
            if not pdf_extracted_successfully:
                try:
                    # 优先使用PyMuPDF (fitz) 处理PDF，避免poppler依赖
                    import fitz  # PyMuPDF
                    logger.info(f"使用PyMuPDF将PDF转换为图片进行OCR识别: {file_path}")
                    
                    # 打开PDF文档
                    pdf_doc = fitz.open(file_path)
                    ocr_results = []
                    
                    # 处理前3页
                    for page_num in range(min(3, pdf_doc.page_count)):
                        logger.info(f"正在处理PDF第{page_num+1}页...")
                        page = pdf_doc[page_num]
                        
                        # 将页面渲染为图像（相当于150 DPI）
                        mat = fitz.Matrix(1.5, 1.5)  # 150% 缩放
                        pix = page.get_pixmap(matrix=mat)
                        
                        # 保存为临时PNG文件
                        temp_img_path = None
                        try:
                            # 创建临时文件路径
                            import uuid
                            temp_filename = f"pdf_page_{uuid.uuid4().hex[:8]}.png"
                            temp_img_path = os.path.join(tempfile.gettempdir(), temp_filename)
                            
                            # 保存图片
                            pix.save(temp_img_path)
                            
                            # 立即释放pixmap资源
                            pix = None
                            
                            page_text = ""
                            page_recognition_method = "未知"
                            
                            # 🚀 优化：使用OCR优化器按优先级识别PDF页面
                            try:
                                from lib.ocr_optimizer import ocr_optimizer
                                page_text, page_recognition_method = ocr_optimizer.recognize_image_with_priority(
                                    image_path=temp_img_path,
                                    max_chars=max_chars//3,
                                    api_key=api_key,
                                    is_local=is_local,
                                    image_recognition_method=image_recognition_method,
                                    online_model=online_model,
                                    api_base=api_base,
                                    model_name=model_name
                                )
                                if page_text:
                                    logger.info(f"PDF第{page_num+1}页识别成功: {page_recognition_method}")
                                else:
                                    logger.warning(f"PDF第{page_num+1}页所有OCR方法均失败")
                            except ImportError:
                                # 如果OCR优化器不可用，使用原有逻辑
                                logger.info(f"OCR优化器不可用，使用原有逻辑处理PDF第{page_num+1}页")
                                # 1. 首先使用用户配置的Tesseract
                                if image_recognition_method in ["tesseract", "both"] or is_local:
                                    try:
                                        logger.info(f"PDF第{page_num+1}页优先使用Tesseract OCR（用户配置）")
                                        img = Image.open(temp_img_path)
                                        tesseract_text = pytesseract.image_to_string(img, lang='chi_sim+eng')
                                        img.close()  # 确保关闭图片文件
                                        if tesseract_text and len(tesseract_text.strip()) > 5:
                                            page_text = tesseract_text
                                            page_recognition_method = "Tesseract OCR (用户配置)"
                                            logger.info(f"PDF第{page_num+1}页使用Tesseract识别成功，内容长度: {len(tesseract_text)} 字符")
                                    except Exception as tesseract_e:
                                        logger.warning(f"PDF第{page_num+1}页使用Tesseract失败: {str(tesseract_e)}")
                            
                            # 2. 如果Tesseract失败，尝试Unstructured
                            if not page_text:
                                if HAS_UNSTRUCTURED and test_unstructured_availability():
                                    try:
                                        logger.info(f"PDF第{page_num+1}页Tesseract失败，尝试Unstructured")
                                        unstructured_text = recognize_image_with_unstructured(temp_img_path, max_chars//3)
                                        if unstructured_text and len(unstructured_text.strip()) > 10:
                                            page_text = unstructured_text
                                            page_recognition_method = "Unstructured (备用方案)"
                                            logger.info(f"PDF第{page_num+1}页使用Unstructured识别成功，内容长度: {len(unstructured_text)} 字符")
                                    except Exception as unstructured_e:
                                        logger.warning(f"PDF第{page_num+1}页使用Unstructured失败: {str(unstructured_e)}")
                            
                            # 3. 如果Unstructured失败，检查是否配置了多模态AI
                            if not page_text and not is_local and api_key:
                                # 检查是否有多模态AI配置
                                try:
                                    from lib.multimodal_ai import multimodal_service
                                    if multimodal_service.is_available():
                                        logger.info(f"PDF第{page_num+1}页尝试使用多模态AI")
                                        success, content, cost, provider = multimodal_service.recognize_image_smart(
                                            image_path=temp_img_path,
                                            prompt="请详细识别这张图片中的所有文字内容，包括中文和英文。"
                                        )
                                        if success and content and len(content.strip()) > 10:
                                            page_text = content
                                            page_recognition_method = f"多模态AI ({provider})"
                                            logger.info(f"PDF第{page_num+1}页使用多模态AI识别成功，成本: ${cost:.4f}")
                                        else:
                                            logger.info(f"PDF第{page_num+1}页多模态AI识别失败或结果为空")
                                except ImportError:
                                    logger.info(f"PDF第{page_num+1}页多模态AI服务未配置")
                                except Exception as multimodal_e:
                                    logger.warning(f"PDF第{page_num+1}页多模态AI识别失败: {str(multimodal_e)}")
                                
                                # 如果多模态AI失败，尝试Moonshot API作为备用
                                if not page_text:
                                    try:
                                        logger.info(f"PDF第{page_num+1}页尝试使用Moonshot API作为备用")
                                        moonshot_text = recognize_image_with_moonshot(temp_img_path, file_parse_key, max_chars//3)  # 🚀 使用 file_parse_key
                                        if moonshot_text and len(moonshot_text.strip()) > 10:
                                            page_text = moonshot_text
                                            page_recognition_method = "Moonshot API (备用方案)"
                                            logger.info(f"PDF第{page_num+1}页使用Moonshot API识别成功")
                                    except Exception as moonshot_e:
                                        logger.warning(f"PDF第{page_num+1}页使用Moonshot API失败: {str(moonshot_e)}")
                            
                                # 原有逻辑的备用处理...
                                pass
                            
                            # 添加页面识别结果
                            if page_text:
                                ocr_results.append(f"--- PDF第{page_num+1}页 ---\n{page_text.strip()}")
                                recognition_method = f"PyMuPDF+{page_recognition_method}"
                            else:
                                logger.warning(f"PDF第{page_num+1}页未能识别出任何文本")
                                ocr_results.append(f"--- PDF第{page_num+1}页 ---\n[无法识别文本内容]")
                        
                        except Exception as page_error:
                            logger.error(f"处理PDF第{page_num+1}页时出错: {str(page_error)}")
                            ocr_results.append(f"--- PDF第{page_num+1}页 ---\n[处理页面时出错: {str(page_error)}]")
                        
                        finally:
                            # 清理资源
                            if 'pix' in locals() and pix:
                                try:
                                    pix = None
                                except:
                                    pass
                            
                            # 清理临时图片文件（更安全的方式）
                            if temp_img_path and os.path.exists(temp_img_path):
                                try:
                                    # 强制垃圾回收，释放可能的文件句柄
                                    import gc
                                    gc.collect()
                                    
                                    # 等待一小段时间确保文件句柄被释放
                                    time.sleep(0.1)
                                    
                                    # 尝试删除文件
                                    os.unlink(temp_img_path)
                                    logger.debug(f"成功删除临时文件: {temp_img_path}")
                                except PermissionError:
                                    # 如果权限不足，记录但不中断处理
                                    logger.warning(f"无法删除临时文件 {temp_img_path}，可能被其他进程占用")
                                except Exception as cleanup_e:
                                    logger.warning(f"清理临时文件失败: {str(cleanup_e)}")
                    
                    pdf_doc.close()
                    
                    # 如果成功识别到内容
                    if ocr_results:
                        ocr_text = "\n\n".join(ocr_results)
                        text = ocr_text[:max_chars]
                        pdf_extracted_successfully = True
                        
                        # 添加详细的识别日志
                        logger.info(f"PDF文件识别完成 - 文件路径: {file_path}, 识别方式: {recognition_method}, 识别内容: {text[:100]}{'...' if len(text) > 100 else ''}")
                        
                        return [Document(page_content=f"文件名：{file_name}\n\n{text}")]
                    else:
                        logger.error("使用PyMuPDF处理PDF后所有页面都未能识别出文本")
                
                except ImportError:
                    logger.warning("PyMuPDF库未安装，无法处理图片型PDF。请运行: pip install PyMuPDF")
                    logger.info("PyMuPDF是pdf2image+poppler的更好替代方案")
                except Exception as pymupdf_e:
                    logger.error(f"使用PyMuPDF处理PDF失败: {str(pymupdf_e)}")
                    # 检查是否是文件权限问题（跨平台兼容）
                    error_str = str(pymupdf_e).lower()
                    if "permission denied" in error_str or "access is denied" in error_str or "operation not permitted" in error_str:
                        logger.warning("检测到文件权限问题，可能是临时文件清理时的权限限制")
                        logger.info("建议解决方案:")
                        logger.info("• 确保程序有足够的文件操作权限")
                        logger.info("• 关闭可能占用临时文件的其他程序")
                        logger.info("• 重新运行程序")
                        
                        # 如果是权限问题但已经有OCR结果，仍然返回结果
                        if 'ocr_results' in locals() and ocr_results:
                            ocr_text = "\n\n".join(ocr_results)
                            text = ocr_text[:max_chars]
                            pdf_extracted_successfully = True
                            recognition_method = f"PyMuPDF+{page_recognition_method if 'page_recognition_method' in locals() else 'OCR'}"
                            
                            logger.info(f"PDF文件识别完成 - 文件路径: {file_path}, 识别方式: {recognition_method} (忽略清理错误), 识别内容: {text[:100]}{'...' if len(text) > 100 else ''}")
                            
                            return [Document(page_content=f"文件名：{file_name}\n\n{text}")]
                
                # 如果PyMuPDF不可用或处理失败，尝试使用unstructured直接处理PDF
                if not pdf_extracted_successfully and HAS_UNSTRUCTURED:
                    try:
                        logger.info("尝试使用Unstructured直接处理PDF")
                        from langchain_community.document_loaders import UnstructuredPDFLoader
                        loader = UnstructuredPDFLoader(file_path)
                        docs = loader.load()
                        if docs and docs[0].page_content:
                            text = docs[0].page_content[:max_chars]
                            pdf_extracted_successfully = True
                            recognition_method = "Unstructured PDF"
                            
                            # 添加详细的识别日志
                            logger.info(f"PDF文件识别完成 - 文件路径: {file_path}, 识别方式: {recognition_method}, 识别内容: {text[:100]}{'...' if len(text) > 100 else ''}")
                            
                            return [Document(page_content=f"文件名：{file_name}\n\n{text}")]
                    except Exception as unstructured_pdf_e:
                        logger.error(f"使用Unstructured直接处理PDF失败: {str(unstructured_pdf_e)}")
                

            
            # 所有方法都失败，返回基本信息和安装指导
            if not pdf_extracted_successfully:
                try:
                    # 获取PDF基本信息
                    reader = PyPDF2.PdfReader(file_path)
                    page_count = len(reader.pages)
                    
                    # 获取文件元数据
                    file_stat = os.stat(file_path)
                    size_kb = file_stat.st_size / 1024
                    created_time = time.ctime(file_stat.st_ctime)
                    modified_time = time.ctime(file_stat.st_mtime)
                    
                    error_msg = f"""PDF文件基本信息:
文件名: {file_name}{file_extension}
页数: {page_count}
大小: {size_kb:.2f} KB
创建时间: {created_time}
修改时间: {modified_time}

🔧 无法提取内容，建议解决方案:
1. 确保PyMuPDF已安装: pip install PyMuPDF
2. 如果是加密PDF，请先解密
3. 检查PDF文件是否完整且未损坏
4. 尝试用其他PDF阅读器打开查看内容"""
                    
                    # 添加详细的识别日志
                    logger.info(f"PDF文件识别失败 - 文件路径: {file_path}, 识别方式: 基本信息提取+安装指导")
                    
                    return [Document(page_content=f"文件名：{file_name}\n\n{error_msg}")]
                except Exception as final_e:
                    error_msg = f"""无法提取PDF内容：{str(final_e)}

这可能是一个损坏的PDF文件或需要特殊工具处理的图片型PDF。

🔧 建议解决方案:
1. 检查PDF文件是否完整且未损坏
2. 安装PyMuPDF: pip install PyMuPDF (推荐，无需poppler)
3. 如果是加密PDF，请先解密
4. 尝试用其他PDF阅读器打开查看内容"""
                    
                    # 添加详细的识别日志
                    logger.info(f"PDF文件识别失败 - 文件路径: {file_path}, 识别方式: 最终失败+错误指导, 错误信息: {str(final_e)}")
                    
                    return [Document(page_content=f"文件名：{file_name}\n\n{error_msg}")]
        elif file_extension in ['.ppt', '.pptx']:
            # 使用最佳实践处理PowerPoint文件（支持 Moonshot API）
            if file_extension == '.pptx':
                text, method = extract_pptx_content(file_path, max_chars, api_key, file_parse_key)
            else:  # .ppt - 传递 api_key 以支持 Moonshot API 识别
                text, method = extract_ppt_content(file_path, max_chars, api_key, file_parse_key)
            
            if text:
                logger.info(f"PPT文件识别完成 - 文件路径: {file_path}, 识别方式: {method}, 识别内容: {text[:100]}{'...' if len(text) > 100 else ''}")
                return [Document(page_content=f"文件名：{file_name}\n\n{text}")]
            else:
                # 如果所有方法都失败，返回基本信息
                metadata = get_file_metadata(file_path)
                error_msg = f"""{metadata}

注意：无法提取此PowerPoint文件的详细内容。
建议解决方案：
1. 安装LibreOffice以获得最佳兼容性
2. 确保文件未损坏
3. 如需查看内容，请直接使用Microsoft PowerPoint打开文件"""
                
                logger.info(f"PPT文件识别失败 - 文件路径: {file_path}, 识别方式: 元数据提取")
                return [Document(page_content=f"文件名：{file_name}\n\n{error_msg}")]

        elif file_extension == '.html':
            # 使用BeautifulSoup处理HTML文件
            try:
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    soup = BeautifulSoup(f.read(), 'html.parser')
                # 提取文本内容，去除脚本和样式
                for script in soup(["script", "style"]):
                    script.extract()
                text = soup.get_text()
                # 清理文本
                lines = (line.strip() for line in text.splitlines())
                chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
                text = '\n'.join(chunk for chunk in chunks if chunk)[:max_chars]
                
                # 添加详细的识别日志
                logger.info(f"HTML文件识别完成 - 文件路径: {file_path}, 识别方式: BeautifulSoup, 识别内容: {text[:100]}{'...' if len(text) > 100 else ''}")
                
                return [Document(page_content=f"文件名：{file_name}\n\n{text}")]
            except Exception as e:
                logger.error(f"处理HTML文件时出错: {str(e)}")
                error_msg = f"无法提取HTML内容: {str(e)}"
                
                # 添加详细的识别日志
                logger.info(f"HTML文件识别失败 - 文件路径: {file_path}, 识别方式: BeautifulSoup, 错误信息: {str(e)}")
                
                return [Document(page_content=f"文件名：{file_name}\n\n{error_msg}")]
        elif file_extension in ['.doc', '.docx']:
            # 🚀 新增：优先使用Moonshot API处理Word文档（非本地模型）
            text = ""
            word_extracted_successfully = False
            recognition_method = "传统方法"
            
            # 检查是否为非本地模型且有API密钥，优先尝试Moonshot处理
            # 支持deepseek/kimi配置或moonshot模型
            moonshot_conditions = (
                online_model in ['deepseek', 'kimi'] or 
                (model_name and 'moonshot' in model_name.lower()) or
                (api_base and 'moonshot' in api_base.lower())
            )
            
            if not is_local and file_parse_key and moonshot_conditions:
                try:
                    from lib.pdf_moonshot_processor import get_pdf_moonshot_processor
                    processor = get_pdf_moonshot_processor(file_parse_key)  # 🚀 使用 file_parse_key 用于文件解析
                    
                    if processor.is_available():
                        logger.info(f"🌙 检测到非本地模型({online_model})，尝试使用Moonshot处理Word文档: {file_path}")
                        
                        # 使用Moonshot处理Word文档
                        moonshot_result = processor.process_word_with_moonshot(
                            file_path=file_path,
                            custom_prompt="请详细提取并总结这个Word文档的所有内容，包括文字、表格和格式信息。"
                        )
                        
                        if moonshot_result:
                            text = moonshot_result[:max_chars]
                            word_extracted_successfully = True
                            recognition_method = "Moonshot API"
                            logger.info(f"Moonshot API处理Word文档成功，内容长度: {len(text)} 字符")
                            
                            return [Document(page_content=f"文件名：{file_name}\n\n{text}")]
                        else:
                            logger.warning("Moonshot API处理Word文档失败，回退到传统方法")
                    else:
                        logger.info("Moonshot处理器不可用，使用传统Word处理方法")
                
                except ImportError:
                    logger.info("PDF Moonshot处理器模块未找到，使用传统Word处理方法")
                except Exception as e:
                    logger.warning(f"Moonshot处理器初始化失败: {str(e)}，回退到传统方法")
            
            # 传统Word处理方法
            if not word_extracted_successfully:
                if file_extension == '.docx':
                    text, method = extract_docx_content(file_path, max_chars)
                else:  # .doc
                    text, method = extract_doc_content(file_path, max_chars)
                recognition_method = method
            
            if text:
                logger.info(f"Word文件识别完成 - 文件路径: {file_path}, 识别方式: {recognition_method}, 识别内容: {text[:100]}{'...' if len(text) > 100 else ''}")
                return [Document(page_content=f"文件名：{file_name}\n\n{text}")]
            else:
                # 如果所有方法都失败，返回基本信息
                metadata = get_file_metadata(file_path)
                error_msg = f"""{metadata}

注意：无法提取此Word文档的详细内容。
建议解决方案：
1. 安装docx2txt以获得更好的DOC文件兼容性: pip install docx2txt
2. 安装LibreOffice以获得最佳跨平台兼容性
3. 确保文件未损坏
4. 如需查看内容，请直接使用Microsoft Word打开文件"""
                
                logger.info(f"Word文件识别失败 - 文件路径: {file_path}, 识别方式: 元数据提取")
                return [Document(page_content=f"文件名：{file_name}\n\n{error_msg}")]

        elif file_extension in ['.xls', '.xlsx']:
            # 使用最佳实践处理Excel文件
            if file_extension == '.xlsx':
                text, method = extract_xlsx_content(file_path, max_chars)
            else:  # .xls
                text, method = extract_xls_content(file_path, max_chars)
            
            if text:
                logger.info(f"Excel文件识别完成 - 文件路径: {file_path}, 识别方式: {method}, 识别内容: {text[:100]}{'...' if len(text) > 100 else ''}")
                return [Document(page_content=f"文件名：{file_name}\n\n{text}")]
            else:
                # 如果所有方法都失败，返回基本信息
                metadata = get_file_metadata(file_path)
                error_msg = f"""{metadata}

注意：无法提取此Excel文件的详细内容。
建议解决方案：
1. 安装pandas以获得最佳Excel处理能力: pip install pandas
2. 对于旧版XLS文件，确保安装xlrd: pip install xlrd
3. 安装LibreOffice以获得最佳跨平台兼容性
4. 确保文件未损坏
5. 如需查看内容，请直接使用Microsoft Excel打开文件"""
                
                logger.info(f"Excel文件识别失败 - 文件路径: {file_path}, 识别方式: 元数据提取")
                return [Document(page_content=f"文件名：{file_name}\n\n{error_msg}")]
        else:
            # 添加详细的识别日志
            logger.info(f"文件识别失败 - 文件路径: {file_path}, 识别方式: 不支持的文件类型, 错误信息: 不支持的文件类型: {file_extension}")
            
            raise ValueError(f"不支持的文件类型: {file_extension}")
    except Exception as e:
        logger.error(f"加载文件 {file_name} 时出错: {str(e)}")
        
        # 添加详细的识别日志
        logger.info(f"文件识别异常 - 文件路径: {file_path}, 识别方式: 异常处理, 错误信息: {str(e)}")
        
        return []


def retry_on_rate_limit(func, *args, max_retries=3, base_delay=2, **kwargs):
    """Call ``func`` and retry it when it fails with a rate-limit style error.

    An exception counts as a rate-limit error when its lower-cased message
    contains "429", "too many requests", "rate limit" or "quota exceeded".
    Any other exception is re-raised immediately without retrying.

    Args:
        func: Callable to execute.
        *args: Positional arguments forwarded to ``func``.
        max_retries: Number of retries allowed after the first call.
        base_delay: Base delay in seconds; the wait before retry ``k``
            (0-based) is ``base_delay * 2**k`` plus up to one second of
            random jitter.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        Whatever ``func`` returns on its first successful call.

    Raises:
        Exception: The last error from ``func`` once retries are exhausted,
            or a non-rate-limit error as soon as it occurs.
    """
    import random
    import time

    rate_limit_markers = ("429", "too many requests", "rate limit", "quota exceeded")

    # One initial call plus ``max_retries`` retries.
    for attempt in range(max_retries + 1):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            lowered = str(e).lower()
            throttled = any(marker in lowered for marker in rate_limit_markers)

            # Unrelated failures are never retried.
            if not throttled:
                raise e

            # Out of retries: report and give up.
            if attempt >= max_retries:
                logger.error(f"429错误重试 {max_retries} 次后仍然失败，放弃处理")
                raise e

            # Exponential backoff with random jitter.
            delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
            logger.warning(f"遇到速率限制错误，等待 {delay:.1f} 秒后重试 (尝试 {attempt + 1}/{max_retries})")
            logger.warning(f"错误详情: {str(e)}")
            time.sleep(delay)

    # Only reachable when the loop body never ran (negative max_retries).
    raise Exception("重试逻辑异常")


def classify_document_with_custom_prompt_priority(content: str, llm: ChatOpenAI, custom_prompt: str = None, 
                     filename: str = None, reference_items: dict = None, language: str = "Chinese") -> FileClassification:
    """Classify a document, giving a non-blank custom prompt priority.

    When ``custom_prompt`` contains non-whitespace text the work is delegated
    to ``lib.custom_prompt_priority.classify_with_custom_prompt_priority``;
    otherwise it falls through to ``classify_document``.

    Args:
        content: Document content.
        llm: LLM instance used for classification.
        custom_prompt: User-defined prompt.
        filename: File name, used as a classification hint and as the
            ``file_path`` passed to the priority classifier.
        reference_items: Reference settings (whether to use content,
            file name, ...).
        language: Language setting.

    Returns:
        The classification produced by whichever classifier was used.
    """
    has_custom_prompt = bool(custom_prompt and custom_prompt.strip())

    # No usable custom prompt: use the original classification path.
    if not has_custom_prompt:
        return classify_document(content, llm, custom_prompt, filename, reference_items, language)

    # Imported lazily, only when the priority classifier is actually needed.
    from lib.custom_prompt_priority import classify_with_custom_prompt_priority

    logger.info(f"🎯 使用自定义提示词优先级分类: {custom_prompt}")
    priority_kwargs = dict(
        file_path=filename if filename else "unknown_file",
        content=content,
        llm=llm,
        custom_prompt=custom_prompt,
        reference_items=reference_items,
        existing_categories=None,  # existing categories could be supplied here
        context_analysis=None,     # context analysis could be supplied here
        language=language,
    )
    return classify_with_custom_prompt_priority(**priority_kwargs)


def _strip_markdown_code_fence(raw_text: str) -> str:
    """Return the payload of the first markdown code fence in ``raw_text``.

    A ```` ```json ```` fence is preferred over a bare ```` ``` ```` fence.
    Text that contains no fence is returned unchanged.
    """
    if "```json" in raw_text:
        return raw_text.split("```json")[1].split("```")[0].strip()
    if "```" in raw_text:
        return raw_text.split("```")[1].split("```")[0].strip()
    return raw_text


def _classification_from_json_text(raw_text: str, language: str) -> "FileClassification":
    """Build a FileClassification from JSON text emitted by the model.

    Capitalised keys (``Category``, ``Subcategory``, ``Confidence``) are
    accepted as aliases for the lower-case ones. A reply that carries a
    ``Description`` key gets ``subcategory="general"`` and ``confidence=0.8``
    when those fields are absent. Any field still missing afterwards is filled
    with a default (category "Unclassified"/"未分类", subcategory "general",
    confidence 0.7) and a warning is logged.

    Raises:
        json.JSONDecodeError: ``raw_text`` is not valid JSON.
        ValueError: The JSON value is not an object, or the confidence cannot
            be converted to ``float``.
    """
    parsed_json = json.loads(raw_text)
    if not isinstance(parsed_json, dict):
        raise ValueError(f"模型输出不是JSON对象: {type(parsed_json).__name__}")

    required_keys = ('category', 'subcategory', 'confidence')

    # Accept capitalised field names as aliases.
    for key in required_keys:
        alias = key.capitalize()
        if key not in parsed_json and alias in parsed_json:
            parsed_json[key] = parsed_json[alias]

    # Replies shaped as {Category, Description} get fixed fallbacks.
    if 'Description' in parsed_json:
        parsed_json.setdefault('subcategory', "general")
        parsed_json.setdefault('confidence', 0.8)

    if not all(k in parsed_json for k in required_keys):
        logger.warning(f"JSON缺少必要字段，尝试从现有字段构建: {parsed_json}")
        parsed_json.setdefault('category', "Unclassified" if language == "English" else "未分类")
        parsed_json.setdefault('subcategory', "general")
        parsed_json.setdefault('confidence', 0.7)

    return FileClassification(
        category=parsed_json['category'],
        subcategory=parsed_json['subcategory'],
        confidence=float(parsed_json['confidence'])
    )


def classify_document(content: str, llm: ChatOpenAI, custom_prompt: str = None, 
                     filename: str = None, reference_items: dict = None, language: str = "Chinese") -> FileClassification:
    """Classify document content with the LLM (original, non-priority variant).

    Builds a system prompt (the built-in rules, or the user's ``custom_prompt``
    followed by the built-in rules), sends the selected content and/or file
    name to the model and turns the reply into a ``FileClassification``.

    For models whose class name is not ``Ollama`` three strategies are tried in
    order: the full chain with the Pydantic parser, a raw call with manual JSON
    parsing, and a short prompt whose reply is scanned with regular
    expressions. Each model call goes through ``retry_on_rate_limit``. For
    ``Ollama`` the full chain is tried first, then a raw call with manual JSON
    parsing.

    Args:
        content: Document content.
        llm: LLM instance. Its ``request_timeout`` and ``max_retries``
            attributes are temporarily overridden for non-Ollama models and
            restored afterwards.
        custom_prompt: User-defined classification rules. Curly braces in it
            are escaped so they are sent to the model literally instead of
            being parsed as prompt-template variables.
        filename: File name, used as a classification hint.
        reference_items: Reference settings; the ``content`` and ``filename``
            keys select which inputs are shown to the model.
        language: ``"English"`` selects the English prompts and defaults; any
            other value selects the Chinese ones.

    Returns:
        The classification, or a default "Unclassified"/"未分类" result with
        confidence 0.0 when every strategy fails. Errors are logged, not
        raised.
    """
    parser = PydanticOutputParser(pydantic_object=FileClassification)
    is_english = language == "English"

    # Which inputs the caller wants the model to look at.
    use_content = True
    use_filename = False
    if reference_items:
        use_content = reference_items.get('content', True)
        use_filename = reference_items.get('filename', False)

    # Assemble the text that is sent to the model.
    analysis_content = ""
    if use_filename and filename:
        analysis_content += f"文件名：{filename}\n\n"

    if use_content and content:
        analysis_content += content
    elif not use_content and use_filename and filename:
        # Filename-only mode.
        analysis_content = f"文件名：{filename}"
    elif not analysis_content:
        # Neither content nor file name was usable: fall back to the content.
        analysis_content = content

    # Built-in classification rules (used when there is no custom prompt).
    if is_english:
        base_prompt = """You are a professional document classification expert. Please carefully analyze the provided information and classify it into the most appropriate category.

CRITICAL REQUIREMENT: ALL category and subcategory names MUST be in English, regardless of the language of the document content.

Classification principles:
1. Create broad, general classifications, avoid excessive subdivision
2. Avoid creating too many similar categories, try to group similar content under the same category
3. Prioritize the main purpose and core content of the document
4. If a filename is provided, focus on analyzing keywords in the filename for classification
5. If document content spans multiple domains, choose the most primary or relevant classification
6. Category names should be concise and clear, easy to understand and manage
7. Subcategories must be highly relevant to the provided information, not too broad or vague
8. If the primary classification name is already specific and accurate enough, no need to create subcategories
9. Be strict when evaluating classification confidence, only give high confidence when very certain about the classification
10. Try to minimize creating new categories, prioritize using broader classification names

LANGUAGE REQUIREMENT: Even if the document content is in Chinese, Japanese, or any other language, you MUST output category and subcategory names in English only. For example:
- Chinese "简历" should be classified as "Resume" or "CV"
- Chinese "合同" should be classified as "Contract"
- Chinese "报告" should be classified as "Report"
- Chinese "教育" should be classified as "Education"
- Chinese "财务" should be classified as "Finance"

DO NOT use any Chinese characters in category or subcategory names."""
    else:
        base_prompt = """你是一个专业的文档分类专家。请仔细分析提供的信息并将其分类到最合适的类别中。

分类原则：
1. 创建宽泛、一般性的分类，避免过度细分类别
2. 避免创建太多相似的类别，尽量将相似内容归为同一类别
3. 优先考虑文档的主要用途和核心内容
4. 如果提供了文件名，要重点分析文件名中的关键词进行分类
5. 如果文档内容跨多个领域，选择最主要或最相关的分类
6. 分类名称应该简洁明了，便于理解和管理
7. 子类别必须与提供的信息高度相关，不能过于宽泛或模糊
8. 如果一级分类名称已经足够具体和准确，就不需要创建子类别
9. 评估分类的置信度时要严格，只有对分类非常有把握时才给出高置信度
10. 尽量减少创建新类别，优先使用更广泛的分类名称

**【视频文件专项分类指导】**
对于视频文件，请特别注意：
1. **避免使用"视频文件"、"多媒体"等模糊分类** - 这些分类没有意义
2. **根据视频内容主题分类** - 教学视频→"教育与培训"，会议录像→"工作会议"，生活记录→"个人生活"
3. **分析文件名中的关键信息**：
   - 包含日期（如"3月15日.mp4"）→ 可能是"个人记录"或"生活片段"
   - 包含关键词（如"会议"、"教程"、"演示"）→ 使用该关键词对应的主题分类
   - 无明显特征的短视频 → "生活记录/随手拍"
4. **参考视频时长和大小**：
   - 短小视频（< 3分钟）→ 可能是社交媒体内容、片段记录
   - 中等时长（3-15分钟）→ 可能是教程、演示、生活分享
   - 长视频（> 15分钟）→ 可能是课程、会议、完整作品
5. **创建有意义的分类**：
   - 推荐：教育与培训、工作会议、个人生活、技术演示、产品展示、旅行记录
   - 避免：视频文件、多媒体文件、未分类项目"""

    # Extra guidance that depends on which inputs are referenced. The same
    # note is appended to both the built-in and the custom system prompt.
    context_note = ""
    if not use_content and use_filename:
        if is_english:
            context_note = """

Note: Currently classifying based on filename only, please focus on analyzing key information in the filename:
- Keywords in the filename (such as project name, document type, date, etc.)
- Naming patterns and structure of the filename
- Document purpose and nature inferred from the filename
Please make the most reasonable classification judgment based on filename information."""
        else:
            context_note = """

注意：当前仅根据文件名进行分类，请重点分析文件名中的关键信息：
- 文件名中的关键词（如项目名、文档类型、日期等）
- 文件名的命名规律和结构
- 从文件名推断的文档用途和性质
请基于文件名信息做出最合理的分类判断。"""
    elif use_filename and use_content:
        if is_english:
            context_note = """

Note: Currently classifying based on both filename and file content:
- The filename provides important classification clues, please focus on it
- Combine file content to verify and refine classification
- If filename and content point to different classifications, prioritize the filename indication"""
        else:
            context_note = """

注意：当前同时参考文件名和文件内容进行分类：
- 文件名提供了重要的分类线索，请重点考虑
- 结合文件内容验证和细化分类
- 如果文件名和内容指向不同分类，优先考虑文件名的指向"""

    # Output format description. The doubled braces are template escapes:
    # the prompt template renders them as single literal braces.
    if is_english:
        format_instructions = r"""
Output format requirements:
1. Must be a valid JSON object
2. Must include the following fields:
   - category: Main classification category of the document (MUST be in English)
   - subcategory: Detailed subcategory of the document (MUST be in English)
   - confidence: Classification confidence (number between 0-1)
3. Example output format for English classification:
{{
  "category": "Resume",
  "subcategory": "Personal Resume",
  "confidence": 0.95
}}

IMPORTANT: Even if the document content is in Chinese (like "简历"), you must classify it using English terms:
{{
  "category": "Resume",
  "subcategory": "Template",
  "confidence": 0.90
}}

Please ensure to return a complete JSON object, do not include any other text, code blocks, backticks or explanations. Only return a valid JSON object with English category names."""
    else:
        format_instructions = r"""
输出格式要求：
1. 必须是一个有效的JSON对象
2. 必须包含以下字段：
   - category: 文档的主要分类类别
   - subcategory: 文档的详细子类别
   - confidence: 分类的置信度（0-1之间的数字）
3. 示例输出格式：
{{
  "category": "技术文档",
  "subcategory": "API文档",
  "confidence": 0.95
}}

请确保返回完整的JSON对象，不要包含任何其他文本、代码块、反引号或解释。仅返回一个合法的JSON对象。"""

    # Assemble the complete system prompt.
    if custom_prompt:
        # The system prompt is parsed as a template, so braces typed by the
        # user must be escaped or they would be treated as template variables
        # and the invocation would fail on the missing variable.
        safe_custom_prompt = custom_prompt.replace("{", "{{").replace("}", "}}")

        if is_english:
            system_prompt = f"""You are a professional document classification expert. Please classify documents according to the following user-defined classification rules:

{safe_custom_prompt}

While following the above user requirements, please also adhere to the following classification principles:
1. Create broad, general classifications, avoid excessive subdivision
2. Avoid creating too many similar categories, try to group similar content under the same category
3. Prioritize the main purpose and core content of the document
4. If a filename is provided, focus on analyzing keywords in the filename for classification
5. If document content spans multiple domains, choose the most primary or relevant classification
6. Classification names should be concise and clear, easy to understand and manage
7. Subcategories must be highly relevant to the provided information, not too broad or vague
8. If the primary classification name is already specific and accurate enough, no need to create subcategories
9. Be strict when evaluating classification confidence, only give high confidence when very certain about the classification
10. Try to minimize creating new categories, prioritize using broader classification names
11. If unable to determine specific subcategory, set subcategory to "general" (Note: when subcategory is "general", files will be placed directly in the main category folder){context_note}

{format_instructions}"""
        else:
            system_prompt = f"""你是一个专业的文档分类专家。请按照以下用户自定义分类规则对文档进行分类：

{safe_custom_prompt}

在遵循上述用户要求的同时，也请遵守以下分类原则：
1. 创建宽泛、一般性的分类，避免过度细分类别
2. 避免创建太多相似的类别，尽量将相似内容归为同一类别
3. 优先考虑文档的主要用途和核心内容
4. 如果提供了文件名，要重点分析文件名中的关键词进行分类
5. 如果文档内容跨多个领域，选择最主要或最相关的分类
6. 分类名称应该简洁明了，便于理解和管理
7. 子类别必须与提供的信息高度相关，不能过于宽泛或模糊
8. 如果一级分类名称已经足够具体和准确，就不需要创建子类别
9. 评估分类的置信度时要严格，只有对分类非常有把握时才给出高置信度
10. 尽量减少创建新类别，优先使用更广泛的分类名称
11. 如果无法确定具体的子类别，应该将subcategory设置为"general"（注意：当subcategory为"general"时，文件会被直接放入主类别文件夹中）{context_note}

{format_instructions}"""
    else:
        system_prompt = base_prompt + context_note + format_instructions

    if is_english:
        user_template = "Please classify the following information, create broad and general categories, avoid excessive subdivision:\n\n{content}"
    else:
        user_template = "请对以下信息进行分类，创建宽泛且通用的类别，避免过度细分：\n\n{content}"

    prompt = ChatPromptTemplate.from_messages([
        ("system", system_prompt),
        ("user", user_template)
    ])

    # The Ollama model class takes a separate path without timeout tuning.
    is_ollama = llm.__class__.__name__ == "Ollama"

    if not is_ollama:
        # Remember the current settings so they can be restored afterwards.
        original_timeout = getattr(llm, "request_timeout", 60)
        original_max_retries = getattr(llm, "max_retries", 2)

        try:
            # Scale the request timeout with the amount of text being sent.
            content_length = len(analysis_content)
            if content_length > 1500:
                timeout = 120
            elif content_length > 800:
                timeout = 90
            else:
                timeout = 60

            llm.request_timeout = timeout
            llm.max_retries = 3

            # Up to three attempts, each with a progressively simpler strategy.
            for attempt in range(3):
                try:
                    if attempt == 0:
                        # Strategy 1: full chain with the Pydantic parser.
                        return retry_on_rate_limit(
                            lambda: (prompt | llm | parser).invoke({"content": analysis_content}),
                            max_retries=3, base_delay=2
                        )

                    if attempt == 1:
                        # Strategy 2: raw model call, JSON parsed by hand.
                        raw_result = retry_on_rate_limit(
                            lambda: (prompt | llm).invoke({"content": analysis_content}),
                            max_retries=3, base_delay=2
                        )
                        raw_text = _strip_markdown_code_fence(getattr(raw_result, "content", raw_result))
                        return _classification_from_json_text(raw_text, language)

                    # Strategy 3: short prompt, truncated content, regex scan.
                    if is_english:
                        simple_prompt = ChatPromptTemplate.from_messages([
                            ("system", "You are a document classification expert. Please return classification results in JSON format, containing category, subcategory, confidence fields. All category names must be in English."),
                            ("user", "Please classify this document: {content}")
                        ])
                    else:
                        simple_prompt = ChatPromptTemplate.from_messages([
                            ("system", "你是文档分类专家。请用JSON格式返回分类结果，包含category、subcategory、confidence字段。"),
                            ("user", "请分类这个文档：{content}")
                        ])

                    raw_result = retry_on_rate_limit(
                        lambda: (simple_prompt | llm).invoke({"content": analysis_content[:1000]}),
                        max_retries=3, base_delay=2
                    )
                    raw_text = getattr(raw_result, "content", raw_result)

                    category_match = re.search(r'"?category"?\s*:\s*"([^"]+)"', raw_text, re.IGNORECASE)
                    subcategory_match = re.search(r'"?subcategory"?\s*:\s*"([^"]+)"', raw_text, re.IGNORECASE)
                    # Match one well-formed number so a sentence-ending period
                    # ("confidence: 0.9.") is not captured and fed to float().
                    confidence_match = re.search(r'"?confidence"?\s*:\s*([0-9]*\.?[0-9]+)', raw_text, re.IGNORECASE)

                    default_category = "Unclassified" if is_english else "未分类"
                    return FileClassification(
                        category=category_match.group(1) if category_match else default_category,
                        subcategory=subcategory_match.group(1) if subcategory_match else "general",
                        confidence=float(confidence_match.group(1)) if confidence_match else 0.5
                    )

                except Exception as attempt_error:
                    error_msg = str(attempt_error).lower()
                    is_rate_limit_error = (
                        "429" in error_msg or
                        "too many requests" in error_msg or
                        "rate limit" in error_msg
                    )

                    if is_rate_limit_error:
                        logger.warning(f"分类尝试 {attempt + 1} 遇到速率限制: {str(attempt_error)}")
                    else:
                        logger.warning(f"分类尝试 {attempt + 1} 失败: {str(attempt_error)}")

                    if attempt < 2:
                        # Rate-limit errors already waited inside
                        # retry_on_rate_limit; other errors pause briefly here.
                        if not is_rate_limit_error:
                            time.sleep(1 * (attempt + 1))
                    else:
                        raise  # the last strategy failed as well

        except Exception as e:
            logger.error(f"分类过程出错: {str(e)}")
        finally:
            # Best-effort restore of the caller's settings.
            try:
                llm.request_timeout = original_timeout
                llm.max_retries = original_max_retries
            except Exception as restore_error:
                logger.warning(f"恢复LLM设置失败: {str(restore_error)}")
    else:
        # Ollama path.
        try:
            try:
                chain = prompt | llm | parser
                return chain.invoke({"content": analysis_content})
            except Exception as e1:
                logger.error(f"Ollama完整链式调用失败，尝试直接解析模型输出: {str(e1)}")

                # Fall back to the raw output and parse it by hand.
                try:
                    raw_result = (prompt | llm).invoke({"content": analysis_content})

                    # The result may be a message object or a plain string,
                    # depending on the model class; accept both.
                    raw_output = getattr(raw_result, "content", raw_result)
                    raw_text = _strip_markdown_code_fence(clean_llm_response(raw_output))

                    try:
                        return _classification_from_json_text(raw_text, language)
                    except Exception as e3:
                        logger.error(f"JSON解析失败: {str(e3)}，原始文本: {raw_text}")
                        raise
                except Exception as e2:
                    logger.error(f"备用解析方法也失败: {str(e2)}")
                    raise

        except Exception as e:
            logger.error(f"Ollama分类过程出错: {str(e)}")

    # Every strategy failed: return the default classification.
    if is_english:
        return FileClassification(
            category="Unclassified",
            subcategory="Pending",
            confidence=0.0
        )
    return FileClassification(
        category="未分类",
        subcategory="待处理",
        confidence=0.0
    )


def remove_empty_directories(directory: str):
    """Delete every empty sub-directory below ``directory``.

    The tree is walked bottom-up, so a directory that becomes empty once its
    own empty children are removed is deleted in the same pass. ``directory``
    itself is never removed. Failures to delete are logged and skipped.
    """
    for parent, child_names, _files in os.walk(directory, topdown=False):
        for child in child_names:
            candidate = os.path.join(parent, child)

            # Anything still inside means the directory must stay.
            if os.listdir(candidate):
                continue

            try:
                os.rmdir(candidate)
                logger.info(f"已删除空目录: {candidate}")
            except Exception as e:
                logger.error(f"删除目录 {candidate} 时出错: {str(e)}")


def match_existing_categories(content: str, existing_categories: List[str], llm: ChatOpenAI, 
                             custom_prompt: str = "", filename: str = None, 
                             reference_items: dict = None, language: str = "Chinese") -> str:
    """Ask the LLM to pick the best-fitting name from ``existing_categories``.

    Args:
        content: Document content.
        existing_categories: Candidate category names; sent to the model one
            per line.
        llm: LLM instance used for the match.
        custom_prompt: Optional user requirements, prepended to the user
            message. Curly braces in it are escaped so they reach the model
            literally instead of being parsed as prompt-template variables.
        filename: File name, used as a matching hint.
        reference_items: Reference settings; the ``content`` and ``filename``
            keys select which inputs are shown to the model.
        language: ``"English"`` selects the English prompts; any other value
            selects the Chinese ones.

    Returns:
        The model reply after ``clean_llm_response``, or the no-match sentinel
        ("No Match" for English, otherwise "未匹配") when there is nothing to
        analyse or the call fails. The reply is not checked against
        ``existing_categories`` here.
    """
    is_english = language == "English"
    no_match_text = "No Match" if is_english else "未匹配"

    # Which inputs the caller wants the model to look at.
    use_content = True
    use_filename = False
    if reference_items:
        use_content = reference_items.get('content', True)
        use_filename = reference_items.get('filename', False)

    # Assemble the text that is sent to the model.
    analysis_content = ""
    if use_filename and filename:
        analysis_content += f"文件名：{filename}\n\n"

    if use_content and content:
        analysis_content += content
    elif not use_content and use_filename and filename:
        # Filename-only mode.
        analysis_content = f"文件名：{filename}"
    elif not analysis_content:
        # Neither content nor file name was usable: fall back to the content.
        analysis_content = content

    # Nothing to analyse: report "no match" without calling the model.
    if not analysis_content:
        logger.info("警告：没有提供有效的分析内容，无法进行匹配")
        return no_match_text

    # Base system prompt, extended according to the referenced inputs.
    if is_english:
        system_prompt = """You are a document classification expert. Please analyze the provided information and select the most appropriate classification name from the given classification list.

CRITICAL REQUIREMENT: All output must be in English only, regardless of the document content language.

If there is no suitable classification, return "No Match". Only return one classification name, do not include any other text."""

        if not use_content and use_filename:
            system_prompt += """

Note: Currently classifying based on filename only, please focus on analyzing key information in the filename to select the most suitable classification."""
        elif use_filename and use_content:
            system_prompt += """

Note: Currently referencing both filename and file content for classification matching, the filename provides important classification clues, please focus on it."""
    else:
        system_prompt = """你是一个文档分类专家。请分析提供的信息，并从给定的分类列表中选择最合适的分类名称。

如果没有合适的分类，返回"未匹配"。只返回一个分类名称，不要有任何其他文本。"""

        if not use_content and use_filename:
            system_prompt += """

注意：当前仅根据文件名进行分类匹配，请重点分析文件名中的关键信息来选择最适合的分类。"""
        elif use_filename and use_content:
            system_prompt += """

注意：当前同时参考文件名和文件内容进行分类匹配，文件名提供了重要的分类线索，请重点考虑。"""

    # A custom prompt goes into the user message rather than the system one.
    # The user message is parsed as a template, so braces typed by the user
    # must be escaped or they would be treated as template variables.
    safe_custom_prompt = custom_prompt.replace("{", "{{").replace("}", "}}") if custom_prompt else ""

    if is_english:
        user_prompt = "Analysis information:\n{content}\n\nAvailable classification list:\n{categories}"
        if safe_custom_prompt:
            user_prompt = f"Please classify according to the following requirements:\n{safe_custom_prompt}\n\n{user_prompt}"
    else:
        user_prompt = "分析信息：\n{content}\n\n可选的分类列表：\n{categories}"
        if safe_custom_prompt:
            user_prompt = f"请按照以下要求进行分类：\n{safe_custom_prompt}\n\n{user_prompt}"

    prompt = ChatPromptTemplate.from_messages([
        ("system", system_prompt),
        ("user", user_prompt)
    ])

    try:
        # Wrap the model call in the rate-limit retry helper.
        def call_match_model():
            chain = prompt | llm
            return chain.invoke({
                "content": analysis_content,
                "categories": "\n".join(existing_categories)
            })

        result = retry_on_rate_limit(call_match_model, max_retries=3, base_delay=2)
        # The result may be a message object or a plain string, depending on
        # the model class; accept both.
        return clean_llm_response(getattr(result, "content", result))
    except Exception as e:
        error_msg = str(e).lower()
        is_rate_limit_error = (
            "429" in error_msg or
            "too many requests" in error_msg or
            "rate limit" in error_msg
        )

        if is_rate_limit_error:
            logger.error(f"匹配分类时遇到速率限制错误，重试失败: {str(e)}")
        else:
            logger.error(f"匹配分类时出错: {str(e)}")
        return no_match_text


def get_existing_categories(output_dir: str) -> List[str]:
    """Return the names of the directories directly inside ``output_dir``.

    Each such directory is treated as one category. If listing fails, the
    error is logged and whatever was collected so far (normally an empty
    list) is returned.
    """
    found: List[str] = []
    try:
        found.extend(
            entry
            for entry in os.listdir(output_dir)
            if os.path.isdir(os.path.join(output_dir, entry))
        )
    except Exception as e:
        logger.error(f"获取已有分类时出错: {str(e)}")
    return found


def get_file_info(file_path: str) -> dict:
    """Collect basic metadata about the file at ``file_path``.

    The returned dict carries the lower-cased extension, the size in bytes,
    the ``st_ctime`` timestamp and the file name without its extension.
    ``content`` is always an empty placeholder; the file body is loaded
    separately when it is actually needed.
    """
    target = Path(file_path)
    stat_result = target.stat()
    info = {"content": ""}
    info["filetype"] = target.suffix.lower()
    info["filesize"] = stat_result.st_size
    info["ctime"] = stat_result.st_ctime
    info["filename"] = target.stem
    return info


def create_llm(llm_type: LLMType = "openai",
               model_name: str = "gpt-3.5-turbo",
               api_base: Optional[str] = None,
               api_key: Optional[str] = None,
               temperature: float = 0) -> ChatOpenAI:
    """Create an LLM client for the requested backend.

    Args:
        llm_type: Backend selector; one of ``"openai"``, ``"ollama"`` or
            ``"custom"``.
        model_name: Name of the model to use.
        api_base: Base URL of the API endpoint. Required for ``"custom"``
            and ignored by the other two backends.
        api_key: API key passed to ``ChatOpenAI`` (``"openai"`` and
            ``"custom"`` backends).
        temperature: Sampling temperature.

    Returns:
        A ``ChatOpenAI`` instance for ``"openai"`` and ``"custom"``; an
        ``Ollama`` instance for ``"ollama"``.

    Raises:
        ValueError: If ``llm_type`` is ``"custom"`` without an ``api_base``,
            or if ``llm_type`` is not one of the supported values.

    Side effects:
        For a ``"custom"`` endpoint whose URL mentions ``localhost:11434`` or
        ``127.0.0.1:11434``, the process-wide ``http_proxy`` and
        ``https_proxy`` environment variables are overwritten.
    """
    if llm_type == "openai":
        # Hosted OpenAI API.
        return ChatOpenAI(
            model_name=model_name,
            temperature=temperature,
            api_key=api_key
        )

    if llm_type == "ollama":
        logger.info("使用Ollama本地模型")
        # Local Ollama model; output is streamed to stdout via the callback handler.
        return Ollama(
            model=model_name,
            temperature=temperature,
            callback_manager=CallbackManager([StreamingStdOutCallbackHandler()])
        )

    if llm_type == "custom":
        # OpenAI-compatible endpoint at a caller-supplied base URL.
        if not api_base:
            raise ValueError("使用custom类型时必须提供api_base")
        # Substring test instead of ``find(...) > 0``: ``find`` returns 0 when the
        # match is at the very start, which the old comparison treated as "absent".
        if 'localhost:11434' in api_base or '127.0.0.1:11434' in api_base:
            # NOTE(review): presumably a local Ollama server on its default port.
            # Pointing the proxy variables at that same address affects every HTTP
            # client in the process -- confirm this is intended.
            os.environ["http_proxy"] = "http://127.0.0.1:11434"
            os.environ["https_proxy"] = "http://127.0.0.1:11434"
        return ChatOpenAI(
            model_name=model_name,
            temperature=temperature,
            api_key=api_key,
            base_url=api_base
        )

    raise ValueError(f"不支持的LLM类型: {llm_type}")



def process_single_file(file_info: dict, llm: ChatOpenAI, api_key: str, is_local: bool,
                        match_existing: bool, existing_categories: List[str],
                        custom_prompt: str, image_recognition_method: ImageRecognitionMethod,
                        processed_files: set, processed_files_lock: threading.Lock,
                        progress_callback: Optional[Callable] = None,
                        total_files: int = 0,
                        output_base_dir: str = None,
                        reference_items: dict = None,
                        max_retries: int = 3,
                        retry_delay: float = 1.0,
                        language: str = "Chinese",
                        enable_cache: bool = True,
                        online_model: str = None,
                        api_base: str = None,
                        model_name: str = None,
                        moonshot_key: Optional[str] = None) -> dict:
    """Classify a single file and return the result without moving or copying it.

    Args:
        file_info: File descriptor dict; the keys ``"file_path"`` and ``"file"``
            (the file name) are read here.
        llm: LLM instance handed to the classification helpers.
        api_key: API key forwarded to the document loader.
        is_local: Forwarded to the document loader (whether a local model is used).
        match_existing: When true, first try to match one of
            ``existing_categories`` and fall back to free classification only
            when no match is found.
        existing_categories: Names of the already existing categories.
        custom_prompt: User supplied prompt forwarded to the classifiers.
        image_recognition_method: Image recognition method forwarded to the loader.
        processed_files: Set of file names already handled in this session.
        processed_files_lock: Lock guarding ``processed_files`` and the global
            ``processed_count``.
        progress_callback: Optional callable invoked with the updated
            ``processed_count``; only called when ``total_files > 0``.
        total_files: Total number of files in the run.
        output_base_dir: Not referenced by this function.
        reference_items: Optional dict; ``"content"`` (default ``True``) and
            ``"filename"`` (default ``False``) decide which inputs are used.
        max_retries: Not referenced by this function; the retry counts are
            fixed locally (1 load attempt, 2 classification attempts).
        retry_delay: Base delay in seconds between attempts; multiplied by the
            attempt number.
        language: ``"English"`` selects English fallback labels, anything else
            the Chinese ones; also forwarded to the classifiers.
        enable_cache: When true, consult and update ``cache_manager`` from
            ``lib.async_scanner``.
        online_model: Forwarded to the document loader.
        api_base: Forwarded to the document loader.
        model_name: Forwarded to the document loader.
        moonshot_key: Forwarded to the document loader.

    Returns:
        dict: On success ``{"file_info", "classification"}``. A file already
        present in ``processed_files`` yields ``{"file_info", "skip", "reason"}``.
        Any failure yields ``{"file_info", "error"}``. On a cache hit the cached
        value is returned unchanged.
    """
    global processed_count
    file_path = file_info["file_path"]
    file = file_info["file"]

    # Cleared only on the duplicate-skip path so the ``finally`` block below does
    # not count a file that was already counted by an earlier call.
    should_update_progress = True

    # Cache lookup. This happens before the main ``try`` so a cache hit is counted
    # here and not a second time by the ``finally`` block.
    if enable_cache:
        try:
            from lib.async_scanner import cache_manager

            cache_key = cache_manager.get_cache_key(file_path)
            cached_result = cache_manager.get_cached_result(cache_key)

            if cached_result:
                logger.info(f"使用缓存结果: {file}")
                with processed_files_lock:
                    processed_count += 1
                    if progress_callback and total_files > 0:
                        # A failing callback must not be reported as a cache failure:
                        # that would fall through to full processing and count this
                        # file twice.
                        try:
                            progress_callback(processed_count)
                        except Exception as progress_error:
                            logger.error(f"更新进度时出错: {str(progress_error)}")

                return cached_result

        except Exception as cache_error:
            logger.warning(f"缓存检查失败: {str(cache_error)}")

    try:
        # Skip files already handled during the current session.
        with processed_files_lock:
            if file in processed_files:
                logger.info(f"文件 {file} 已经被分类到其他目录，跳过重复分类")
                should_update_progress = False
                return {
                    "file_info": file_info,
                    "skip": True,
                    "reason": "文件已经被分类到其他目录"
                }

        # Basic existence check.
        if not os.path.exists(file_path):
            logger.warning(f"文件不存在: {file_path}")
            return {
                "file_info": file_info,
                "error": "文件不存在"
            }

        # Refuse files larger than 100 MB; a failing size lookup is only logged.
        try:
            file_size = os.path.getsize(file_path)
            if file_size > 100 * 1024 * 1024:
                logger.warning(f"文件过大，跳过处理: {file} ({file_size / 1024 / 1024:.1f}MB)")
                return {
                    "file_info": file_info,
                    "error": f"文件过大 ({file_size / 1024 / 1024:.1f}MB)"
                }
        except Exception as size_error:
            logger.warning(f"无法获取文件大小: {file}, 错误: {str(size_error)}")

        # Names starting with "~$" or ".~" are treated as temporary files.
        if file.startswith(('~$', '.~')):
            logger.info(f"警告：文件 {file} 是临时文件，跳过处理")
            return {
                "file_info": file_info,
                "error": "临时文件，跳过处理"
            }

        # Which inputs feed the classifier: content by default, file name on request.
        use_content = True
        use_filename = False
        if reference_items:
            use_content = reference_items.get('content', True)
            use_filename = reference_items.get('filename', False)

        content = ""
        if use_content:
            # A single load attempt: fail fast instead of stalling on a bad file.
            max_load_retries = 1
            for attempt in range(max_load_retries):
                try:
                    docs = load_document_with_timeout(
                        file_path=file_path,
                        max_chars=5000,
                        api_key=api_key,
                        is_local=is_local,
                        image_recognition_method=image_recognition_method,
                        timeout=30,
                        use_dynamic_timeout=True,
                        online_model=online_model,
                        api_base=api_base,
                        model_name=model_name,
                        moonshot_key=moonshot_key
                    )
                    if docs:
                        content = docs[0].page_content
                        break
                    else:
                        logger.info(f"警告：无法加载文件 {file} 的内容 (尝试 {attempt + 1}/{max_load_retries})")
                        # Last attempt: without content the file name is the only
                        # remaining input, so give up if it is not enabled.
                        if attempt == max_load_retries - 1:
                            if not use_filename:
                                return {
                                    "file_info": file_info,
                                    "error": "无法加载文件内容且未启用文件名参考"
                                }
                except FileLoadTimeoutException as timeout_error:
                    logger.error(f"文件 {file} 加载超时: {str(timeout_error)}")
                    # A load timeout is never retried.
                    return {
                        "file_info": file_info,
                        "error": f"文件加载超时：{str(timeout_error)}"
                    }
                except KeyboardInterrupt:
                    # User interruption propagates to the caller.
                    raise
                except Exception as doc_error:
                    logger.error(f"加载文件 {file} 内容失败 (尝试 {attempt + 1}/{max_load_retries}): {str(doc_error)}")
                    if attempt < max_load_retries - 1:
                        time.sleep(retry_delay * (attempt + 1))  # increasing delay
                    else:
                        if not use_filename:
                            return {
                                "file_info": file_info,
                                "error": f"加载文件内容失败且未启用文件名参考: {str(doc_error)}"
                            }
                        # File-name reference is enabled: continue with empty content.
                        content = ""

        # File name without extension, only when it is a classification input.
        filename = Path(file_path).stem if use_filename else None

        classification = None
        max_classify_retries = 2
        for attempt in range(max_classify_retries):
            try:
                # NOTE(review): presumably WatchdogTimer invokes the callback once
                # ``watchdog_timeout`` seconds elapse without ``stop()`` -- confirm
                # against its definition.
                watchdog_timeout = 45
                watchdog = WatchdogTimer(watchdog_timeout, lambda: logger.error(f"文件 {file} 分类过程被看门狗监控到超时"))
                watchdog.start()

                try:
                    if match_existing:
                        matched_category = safe_match_existing_categories(
                            content, existing_categories, llm, custom_prompt,
                            filename, reference_items, language
                        )
                        if matched_category in ("未匹配", "No Match"):
                            logger.info(f"警告：文件 {file} 无法匹配到任何已有分类，将使用默认分类方式")
                            classification = safe_classify_document(
                                content, llm, custom_prompt, filename, reference_items, language
                            )
                        else:
                            # A matched existing category is taken at full confidence.
                            classification = FileClassification(
                                category=matched_category,
                                subcategory="general",
                                confidence=1.0
                            )
                    else:
                        classification = safe_classify_document(
                            content, llm, custom_prompt, filename, reference_items, language
                        )
                finally:
                    # Always stop the watchdog, whether or not classification succeeded.
                    watchdog.stop()

                # A usable result ends the retry loop.
                if classification and hasattr(classification, 'category') and classification.category:
                    break
                else:
                    logger.warning(f"文件 {file} 的分类结果无效 (尝试 {attempt + 1}/{max_classify_retries})")
                    if attempt < max_classify_retries - 1:
                        time.sleep(retry_delay * (attempt + 1))

            except TimeoutError as timeout_error:
                # A classification timeout is never retried.
                logger.error(f"文件 {file} 分类超时，跳过此文件: {str(timeout_error)}")
                return {
                    "file_info": file_info,
                    "error": f"文件分类超时: {str(timeout_error)}"
                }
            except Exception as classify_error:
                logger.error(f"分类文件 {file} 时出错 (尝试 {attempt + 1}/{max_classify_retries}): {str(classify_error)}")
                if attempt < max_classify_retries - 1:
                    time.sleep(retry_delay * (attempt + 1))

        # Fall back to a zero-confidence placeholder when no usable result exists.
        if not classification or not hasattr(classification, 'category') or not classification.category:
            logger.warning(f"文件 {file} 的分类结果无效，使用默认分类")
            if language == "English":
                classification = FileClassification(
                    category="Unclassified",
                    subcategory="Pending",
                    confidence=0.0
                )
            else:
                classification = FileClassification(
                    category="未分类",
                    subcategory="待处理",
                    confidence=0.0
                )

        # Remember the file so later calls in this session skip it.
        with processed_files_lock:
            processed_files.add(file)

        result = {
            "file_info": file_info,
            "classification": classification
        }

        # Best-effort cache write; a failure is only logged.
        if enable_cache:
            try:
                from lib.async_scanner import cache_manager
                cache_key = cache_manager.get_cache_key(file_path)
                cache_manager.save_result(cache_key, result)
            except Exception as cache_error:
                logger.warning(f"保存缓存失败: {str(cache_error)}")

        return result

    except Exception as e:
        logger.error(f"处理文件 {file} 时出现严重错误: {str(e)}")
        return {
            "file_info": file_info,
            "error": str(e)
        }
    finally:
        # Every exit from the ``try`` above is counted once, except the
        # duplicate-skip path which cleared ``should_update_progress``.
        if should_update_progress:
            with processed_files_lock:
                processed_count += 1
                if progress_callback and total_files > 0:
                    try:
                        progress_callback(processed_count)
                        logger.info(f"进度更新: {processed_count}/{total_files}")
                    except Exception as progress_error:
                        logger.error(f"更新进度时出错: {str(progress_error)}")

def process_directory_with_custom_prompt_priority(files_to_process: List[dict], output_base_dir: str, is_local: bool = False, confidence_threshold: float = 0.8,
                      match_existing: bool = False, reference_items: dict = None, custom_prompt: str = None,
                      image_recognition_method: ImageRecognitionMethod = "tesseract",
                      llm_type: LLMType = "openai",
                      model_name: str = "gpt-3.5-turbo",
                      api_base: Optional[str] = None,
                      api_key: Optional[str] = None,
                      max_workers: int = 4,
                      tesseract_path: Optional[str] = None,
                      progress_callback: Optional[Callable] = None,
                      enable_aggregation: bool = True,
                      max_categories: int = 8,
                      enable_deduplication: bool = False,
                      language: str = "Chinese",
                      enable_custom_prompt_priority: bool = True,
                      online_model: str = None,
                      moonshot_key: Optional[str] = None) -> dict:
    """Dispatch a batch of files to the priority or the standard pipeline.

    When ``enable_custom_prompt_priority`` is true and ``custom_prompt`` is
    non-empty after stripping whitespace, the work is delegated to
    ``_process_directory_with_priority``. Otherwise it is delegated to
    ``process_directory``, which does not receive ``online_model``.

    Args:
        enable_custom_prompt_priority: Whether the custom-prompt priority
            mode may be used.
        online_model: Forwarded to the priority pipeline only.
        All remaining arguments are forwarded unchanged to the selected
        pipeline.

    Returns:
        Whatever the selected pipeline returns.
    """
    # Arguments both pipelines accept under identical parameter names.
    shared_kwargs = dict(
        files_to_process=files_to_process,
        output_base_dir=output_base_dir,
        is_local=is_local,
        confidence_threshold=confidence_threshold,
        match_existing=match_existing,
        reference_items=reference_items,
        custom_prompt=custom_prompt,
        image_recognition_method=image_recognition_method,
        llm_type=llm_type,
        model_name=model_name,
        api_base=api_base,
        api_key=api_key,
        max_workers=max_workers,
        tesseract_path=tesseract_path,
        progress_callback=progress_callback,
        enable_aggregation=enable_aggregation,
        max_categories=max_categories,
        enable_deduplication=enable_deduplication,
        language=language,
        moonshot_key=moonshot_key,
    )

    wants_priority = enable_custom_prompt_priority and custom_prompt and custom_prompt.strip()
    if not wants_priority:
        # Standard classification path.
        logger.info("📋 使用标准分类模式")
        return process_directory(**shared_kwargs)

    logger.info(f"🎯 启用自定义提示词优先级模式: {custom_prompt}")
    return _process_directory_with_priority(online_model=online_model, **shared_kwargs)


def _process_directory_with_priority(files_to_process: List[dict], output_base_dir: str,
                                    is_local: bool, confidence_threshold: float,
                                    match_existing: bool, reference_items: dict,
                                    custom_prompt: str, image_recognition_method: str,
                                    llm_type: str, model_name: str, api_base: Optional[str],
                                    api_key: Optional[str], max_workers: int,
                                    tesseract_path: Optional[str], progress_callback: Optional[Callable],
                                    enable_aggregation: bool, max_categories: int,
                                    enable_deduplication: bool, language: str, online_model: str = None,
                                    moonshot_key: Optional[str] = None) -> dict:
    """Classify a batch of files with the custom prompt given top priority.

    An LLM is created from the caller's configuration. If that fails the whole
    call is delegated to ``process_directory``. Otherwise every file is loaded
    and classified on a thread pool via
    ``classify_document_with_custom_prompt_priority``.

    Args:
        files_to_process: File descriptor dicts; ``"file_path"`` and ``"file"``
            are read from each.
        output_base_dir: Output directory; only listed for existing category
            folders when ``match_existing`` is true.
        is_local: Forwarded to the document loader.
        confidence_threshold: Minimum confidence for a classification to be
            accepted; anything lower is recorded as unclassified.
        match_existing: Whether to list the existing category folders (they
            are only logged here).
        reference_items: Forwarded to the classifier.
        custom_prompt: The user's prompt, forwarded to the classifier.
        image_recognition_method: Forwarded to the document loader.
        llm_type, model_name, api_base, api_key: LLM configuration passed to
            ``create_llm``; all but ``llm_type`` also go to the document loader.
        max_workers: Thread pool size.
        tesseract_path, enable_aggregation, max_categories,
        enable_deduplication: Only forwarded to ``process_directory`` on the
            fallback path.
        progress_callback: Optional callable invoked with the number of files
            finished so far.
        language: Forwarded to the classifier.
        online_model: Forwarded to the document loader.
        moonshot_key: Forwarded to the document loader.

    Returns:
        dict with three keys:
        ``"categories"``: ``{category: {"subcategories": {sub: {"files": [...]}}}}``;
        ``"unclassified"``: list of ``{"original_path", "filename", "error"}``;
        ``"processing_info"``: mode, prompt and file counters.
        On the fallback path, whatever ``process_directory`` returns.
    """

    import threading
    from concurrent.futures import ThreadPoolExecutor, as_completed

    # Build the LLM exactly as configured by the user; the llm_type is not
    # rewritten here.
    try:
        logger.info(f"🎯 自定义提示词优先级模式 - 使用用户配置的LLM类型: {llm_type}")
        logger.info(f"🎯 API地址: {api_base}")
        logger.info(f"🎯 模型名称: {model_name}")

        llm = create_llm(
            llm_type=llm_type,
            model_name=model_name,
            api_base=api_base,
            api_key=api_key
        )
        logger.info(f"🎯 自定义提示词优先级模式LLM创建成功: {llm_type}/{model_name}")
    except Exception as llm_error:
        logger.error(f"🎯 自定义提示词优先级模式LLM创建失败: {str(llm_error)}")
        # LLM creation failed: fall back to the standard pipeline.
        logger.info("回退到标准分类处理方式")
        from lib.ai import process_directory
        return process_directory(
            files_to_process, output_base_dir, is_local, confidence_threshold,
            match_existing, reference_items, custom_prompt, image_recognition_method,
            llm_type, model_name, api_base, api_key, max_workers, tesseract_path,
            progress_callback, enable_aggregation, max_categories, enable_deduplication, language,
            moonshot_key
        )

    # List existing (non-hidden) category folders when requested. The list is
    # only logged; it is not passed to the classifier below.
    existing_categories = []
    if match_existing and os.path.exists(output_base_dir):
        try:
            existing_categories = [
                d for d in os.listdir(output_base_dir)
                if os.path.isdir(os.path.join(output_base_dir, d)) and not d.startswith('.')
            ]
            logger.info(f"🎯 找到现有分类供自定义提示词参考: {existing_categories}")
        except Exception as e:
            logger.warning(f"获取现有分类失败: {str(e)}")

    organized_files = {
        "categories": {},
        "unclassified": []
    }

    results_lock = threading.Lock()
    processed_files = set()
    processed_files_lock = threading.Lock()

    def process_single_file_with_priority(file_info: dict) -> dict:
        """Load and classify one file; never raises, errors come back in the dict."""

        file_path = file_info["file_path"]
        file_name = file_info["file"]

        try:
            # Skip file names already classified in this run.
            with processed_files_lock:
                if file_name in processed_files:
                    return {"file_info": file_info, "error": "文件已被处理"}

            # Load the content; on any failure continue with the file name only.
            content = ""
            try:
                docs = load_document_with_timeout(
                    file_path=file_path,
                    max_chars=5000,
                    api_key=api_key,
                    is_local=is_local,
                    image_recognition_method=image_recognition_method,
                    timeout=120,
                    use_dynamic_timeout=True,
                    online_model=online_model,
                    api_base=api_base,
                    model_name=model_name,
                    moonshot_key=moonshot_key
                )
                if docs:
                    content = docs[0].page_content
            except Exception as load_error:
                logger.warning(f"加载文件内容失败，仅使用文件名: {file_name}")
                content = ""

            filename = os.path.basename(file_path)
            classification = classify_document_with_custom_prompt_priority(
                content=content,
                llm=llm,
                custom_prompt=custom_prompt,
                filename=filename,
                reference_items=reference_items,
                language=language
            )

            if classification and classification.confidence >= confidence_threshold:
                with processed_files_lock:
                    processed_files.add(file_name)

                result = {
                    "file_info": file_info,
                    "classification": classification,
                    "content": content[:500] if content else "",
                    "custom_prompt_used": True
                }

                logger.info(f"🎯 自定义提示词分类完成: {file_name} -> {classification.category} (置信度: {classification.confidence:.2f})")
                return result

            # Missing result or confidence below the threshold.
            confidence_value = classification.confidence if classification else 0
            logger.warning(f"自定义提示词分类置信度不足: {file_name} (置信度: {confidence_value:.2f})")
            return {"file_info": file_info, "error": f"分类置信度不足 ({confidence_value:.2f})"}

        except Exception as e:
            logger.error(f"自定义提示词处理文件失败: {file_name} - {str(e)}")
            return {"file_info": file_info, "error": str(e)}

    logger.info(f"🎯 开始使用自定义提示词优先级处理 {len(files_to_process)} 个文件")

    total_files_count = len(files_to_process)
    processed_count = 0

    def report_progress(log_success: bool = False) -> None:
        """Count one finished file and notify the progress callback, if any."""
        nonlocal processed_count
        processed_count += 1
        if not progress_callback:
            return
        try:
            progress_callback(processed_count)
            if log_success:
                logger.info(f"🎯 进度更新: {processed_count}/{total_files_count} ({int(processed_count/total_files_count*100)}%)")
        except Exception as progress_error:
            logger.error(f"进度回调出错: {str(progress_error)}")

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_file = {
            executor.submit(process_single_file_with_priority, file_info): file_info
            for file_info in files_to_process
        }

        for future in as_completed(future_to_file):
            file_info = future_to_file[future]
            try:
                # NOTE(review): as_completed only yields finished futures, so this
                # timeout cannot expire and the TimeoutError branch is effectively
                # dead. A real per-file limit would need as_completed(timeout=...)
                # or a timeout inside the worker.
                result = future.result(timeout=90)
            except TimeoutError:
                logger.warning(f"文件处理超时，快速跳过: {file_info.get('file', 'unknown')}")
                future.cancel()
                report_progress()
                continue
            except Exception as e:
                logger.error(f"处理文件时出现异常 {file_info.get('file', 'unknown')}: {str(e)}")
                report_progress()
                continue

            report_progress(log_success=True)

            file_path = file_info["file_path"]
            file_name = file_info["file"]

            if "error" not in result and "classification" in result:
                classification = result["classification"]
                category = classification.category
                subcategory = classification.subcategory or "general"

                # Strip characters that are not allowed in Windows file names.
                from lib.common import Common
                category = Common.sanitize_filename(category)
                subcategory = Common.sanitize_filename(subcategory)

                with results_lock:
                    category_entry = organized_files["categories"].setdefault(
                        category, {"subcategories": {}}
                    )
                    subcategory_entry = category_entry["subcategories"].setdefault(
                        subcategory, {"files": []}
                    )
                    subcategory_entry["files"].append({
                        "original_path": file_path,
                        "filename": file_name,
                        "confidence": classification.confidence,
                        "custom_prompt_used": result.get("custom_prompt_used", False)
                    })
            else:
                # Classification failed or was rejected.
                error = result.get("error", "未知错误")

                with results_lock:
                    organized_files["unclassified"].append({
                        "original_path": file_path,
                        "filename": file_name,
                        "error": error
                    })

    logger.info(f"🎯 自定义提示词优先级处理完成: {len(organized_files['categories'])} 个分类")

    # Count the files themselves. Iterating ``category.values()`` would yield the
    # "subcategories" dict and therefore count sub-categories, not files.
    classified_total = sum(
        len(subcategory_entry["files"])
        for category_entry in organized_files["categories"].values()
        for subcategory_entry in category_entry["subcategories"].values()
    )

    organized_files["processing_info"] = {
        "mode": "custom_prompt_priority",
        "custom_prompt": custom_prompt,
        "total_files": len(files_to_process),
        "classified_files": classified_total,
        "unclassified_files": len(organized_files["unclassified"])
    }

    return organized_files


def process_directory(files_to_process: List[dict], output_base_dir: str, is_local: bool = False, confidence_threshold: float = 0.8,
                      match_existing: bool = False, reference_items: Optional[dict] = None, custom_prompt: Optional[str] = None,
                      image_recognition_method: ImageRecognitionMethod = "tesseract",
                      llm_type: LLMType = "openai",
                      model_name: str = "gpt-3.5-turbo",
                      api_base: Optional[str] = None,
                      api_key: Optional[str] = None,
                      max_workers: int = 4,
                      tesseract_path: Optional[str] = None,
                      progress_callback: Optional[Callable] = None,
                      enable_aggregation: bool = True,
                      max_categories: int = 8,
                      enable_deduplication: bool = False,
                      language: str = "Chinese",
                      moonshot_key: Optional[str] = None):
    """Classify a list of files with worker threads and return the organized structure.

    No file is moved or copied here; the function only builds and returns a
    dictionary describing where each file belongs. The output base directory
    is created if it does not exist.

    Args:
        files_to_process: List of file descriptors. Each dict is read via the
            keys "file_path" and "file".
        output_base_dir: Output directory path (created if missing; also
            scanned for existing categories when match_existing is set).
        is_local: Whether a local model is used.
        confidence_threshold: Confidence threshold. NOTE(review): this value
            is not read anywhere in this function body.
        match_existing: Whether to match against category directories that
            already exist under output_base_dir. Falls back to False when no
            existing category is found.
        reference_items: Classification reference items, forwarded to
            process_single_file.
        custom_prompt: User-defined prompt, forwarded to process_single_file.
        image_recognition_method: Image recognition method, forwarded to
            process_single_file.
        llm_type: LLM type passed to create_robust_llm.
        model_name: Model name.
        api_base: API base URL.
        api_key: API key.
        max_workers: Upper bound for the number of worker threads.
        tesseract_path: Path of the Tesseract executable; when given it is
            assigned to pytesseract.pytesseract.tesseract_cmd.
        progress_callback: Progress callback, forwarded to the helpers.
        enable_aggregation: Whether to run intelligent_aggregate_categories
            on the result.
        max_categories: Category limit passed to the aggregation step.
        enable_deduplication: Whether to run deduplicate_files_by_content on
            the input list first.
        language: Language option forwarded to the classification and
            aggregation helpers.
        moonshot_key: Moonshot API key forwarded to process_single_file.

    Returns:
        dict: Organized file structure of the form:
        {
            "categories": {
                "category_name": {
                    "subcategories": {
                        "subcategory_name": {
                            "files": [
                                {
                                    "original_path": "original file path",
                                    "filename": "file name",
                                    "confidence": confidence
                                },
                                ...
                            ]
                        },
                        ...
                    }
                },
                ...
            },
            "unclassified": [
                {
                    "original_path": "original file path",
                    "filename": "file name",
                    "error": "error message"
                },
                ...
            ]
        }
    """
    # Function-scope imports: gc is needed by the stalled-worker recovery path
    # and Empty by the non-blocking queue read in the worker.
    import gc
    from queue import Empty, Queue

    # Module-level progress counter, reset for this run. It is only read in
    # this function; presumably process_single_file increments it - verify.
    global processed_count
    processed_count = 0

    # Optional de-duplication of the input list.
    if enable_deduplication:
        logger.info("启用文件去重功能")
        files_to_process = deduplicate_files_by_content(files_to_process, progress_callback)
    else:
        logger.info("文件去重功能已禁用")

    # Nothing to do: return an empty structure right away.
    if not files_to_process:
        logger.warning("没有文件需要处理")
        return {"categories": {}, "unclassified": []}

    if tesseract_path:
        logger.info("存在tesseract_path {}".format(tesseract_path))
        pytesseract.pytesseract.tesseract_cmd = tesseract_path

    # Initialize the LLM through the fault-tolerant factory.
    try:
        llm = create_robust_llm(
            llm_type=llm_type,
            model_name=model_name,
            api_base=api_base,
            api_key=api_key,
            max_retries=3,
            timeout=120
        )
        logger.info(f"LLM初始化成功: {llm_type}/{model_name}")
    except Exception as llm_error:
        logger.error(f"LLM初始化失败: {str(llm_error)}")
        # Without an LLM nothing can be classified: report every file as
        # unclassified with the initialization error attached.
        return {
            "categories": {},
            "unclassified": [
                {
                    "original_path": file_info["file_path"],
                    "filename": file_info["file"],
                    "error": f"LLM初始化失败: {str(llm_error)}"
                } for file_info in files_to_process
            ]
        }

    # Create the output base directory if it does not exist.
    os.makedirs(output_base_dir, exist_ok=True)

    # Shared set (plus its lock) handed to process_single_file to track
    # files that were already handled.
    processed_files = set()
    processed_files_lock = threading.Lock()

    # Accumulated result structure, guarded by results_lock.
    organized_files = {
        "categories": {},
        "unclassified": []
    }
    results_lock = threading.Lock()

    # When matching existing categories is requested, load them; fall back to
    # the default classification if the output directory has none.
    existing_categories = []
    if match_existing:
        existing_categories = get_existing_categories(output_base_dir)
        if not existing_categories:
            logger.info("警告：输出目录下没有找到任何分类目录，将使用默认分类方式")
            match_existing = False

    # files_to_process is guaranteed non-empty here (see the early return above).
    total_files = len(files_to_process)

    logger.info(f"\n共找到 {total_files} 个文件需要处理")

    # Thread-safe queue holding the files still to be processed.
    file_queue = Queue()
    for file_info in files_to_process:
        file_queue.put(file_info)

    def handle_result(result):
        """Merge one process_single_file result into organized_files."""
        if not result:
            logger.warning("处理文件时返回了空结果")
            return

        with results_lock:
            file_info = result["file_info"]
            file_path = file_info["file_path"]
            file_name = file_info["file"]

            # Skipped files are not recorded anywhere.
            if result.get("skip", False):
                reason = result.get("reason", "未知原因")
                logger.debug(f"跳过文件 {file_name}: {reason}")
                return

            if "classification" in result:
                # Successful classification: file the entry under
                # category / subcategory.
                classification = result["classification"]
                category = classification.category
                subcategory = classification.subcategory
                confidence = classification.confidence

                # Sanitize both names so they are usable as directory names.
                from lib.common import Common
                category = Common.sanitize_filename(category)
                subcategory = Common.sanitize_filename(subcategory)

                # Normalize an empty subcategory to "general".
                if not subcategory or subcategory.strip() == "":
                    subcategory = "general"
                    logger.info(f"子类别名称为空，已标准化为'general'")

                # Make sure the category exists.
                if category not in organized_files["categories"]:
                    organized_files["categories"][category] = {
                        "subcategories": {}
                    }

                # Make sure the subcategory exists.
                if subcategory not in organized_files["categories"][category]["subcategories"]:
                    organized_files["categories"][category]["subcategories"][subcategory] = {
                        "files": []
                    }

                # Record the file.
                organized_files["categories"][category]["subcategories"][subcategory]["files"].append({
                    "original_path": file_path,
                    "filename": file_name,
                    "confidence": confidence
                })

                logger.info(f"已将文件 {file_name} 分类到 {category}/{subcategory} 类别 (置信度: {confidence:.2f})")
            else:
                # No classification: record the file as unclassified.
                error = result.get("error", "未知错误")

                organized_files["unclassified"].append({
                    "original_path": file_path,
                    "filename": file_name,
                    "error": error
                })

                logger.info(f"处理文件 {file_name} 时出错: {error}")

    # Error statistics, guarded by stats_lock.
    error_stats = {
        "network_errors": 0,
        "file_errors": 0,
        "ai_errors": 0,
        "other_errors": 0,
        "consecutive_failures": 0,
        "max_consecutive_failures": 15  # consecutive-failure threshold
    }
    stats_lock = threading.Lock()

    # Circuit-breaker state (also guarded by stats_lock). Only "is_open",
    # "failure_count", "last_failure_time" and "timeout" are read below; the
    # remaining threshold entries are not referenced in this function.
    circuit_breaker = {
        "is_open": False,
        "failure_count": 0,
        "last_failure_time": 0,
        "timeout": 20,  # seconds to wait once the breaker is open
        "error_rate_threshold": 0.6,
        "min_samples": 8,
        "network_errors_threshold": 8,
        "file_errors_threshold": 12,
        "ai_errors_threshold": 6
    }

    def should_skip_due_to_circuit_breaker():
        """Return True while the breaker is open; reset it once the timeout elapsed."""
        with stats_lock:
            if circuit_breaker["is_open"]:
                if time.time() - circuit_breaker["last_failure_time"] > circuit_breaker["timeout"]:
                    # Timeout elapsed: close the breaker again.
                    circuit_breaker["is_open"] = False
                    circuit_breaker["failure_count"] = 0
                    logger.info("断路器已重置，恢复处理")
                    return False
                else:
                    return True
            return False

    def update_error_stats(error_type: str, is_success: bool = False):
        """Record one success or one failure of the given error_type."""
        with stats_lock:
            if is_success:
                error_stats["consecutive_failures"] = 0
                # A success pays back two failures, helping the breaker recover.
                if circuit_breaker["failure_count"] > 0:
                    circuit_breaker["failure_count"] = max(0, circuit_breaker["failure_count"] - 2)

                # Close an open breaker early once the failure count is low.
                if circuit_breaker["is_open"] and circuit_breaker["failure_count"] <= 5:
                    circuit_breaker["is_open"] = False
                    circuit_breaker["failure_count"] = 0
                    logger.info("断路器提前重置，因为最近处理成功且失败次数较少")
            else:
                error_stats[error_type] += 1
                error_stats["consecutive_failures"] += 1
                circuit_breaker["failure_count"] += 1
                circuit_breaker["last_failure_time"] = time.time()

                # Error rate relative to the number of files processed so far.
                # (The previous expression, errors + (processed - errors),
                # always reduced to processed_count.)
                total_processed = processed_count
                error_rate = circuit_breaker["failure_count"] / max(1, total_processed) if total_processed > 0 else 0

                # Decide whether the breaker has to open.
                should_open_circuit = False
                if error_stats["consecutive_failures"] >= error_stats["max_consecutive_failures"]:
                    should_open_circuit = True
                    reason = f"连续失败次数过多: {error_stats['consecutive_failures']}"
                elif circuit_breaker["failure_count"] >= 30 and error_rate > 0.8:
                    should_open_circuit = True
                    reason = f"总失败次数过多且错误率过高: {circuit_breaker['failure_count']} 次失败, 错误率: {error_rate:.1%}"
                elif circuit_breaker["failure_count"] >= 80:
                    should_open_circuit = True
                    reason = f"总失败次数过多: {circuit_breaker['failure_count']} (阈值: 80)"

                if should_open_circuit:
                    circuit_breaker["is_open"] = True
                    logger.warning(f"断路器已打开，暂停处理。原因: {reason}")

    def worker():
        """Drain file_queue, classifying one file at a time."""
        thread_id = threading.current_thread().ident
        logger.info(f"工作线程 {thread_id} 启动")

        while not file_queue.empty():
            try:
                # While the breaker is open, wait instead of taking work.
                if should_skip_due_to_circuit_breaker():
                    with stats_lock:
                        remaining_time = circuit_breaker["timeout"] - (time.time() - circuit_breaker["last_failure_time"])
                        logger.warning(f"线程 {thread_id}: 断路器已打开，跳过处理。剩余等待时间: {remaining_time:.1f}秒")
                    time.sleep(1)
                    continue

                try:
                    # Non-blocking read: another worker may have emptied the
                    # queue since the loop condition was evaluated.
                    file_info = file_queue.get(block=False)
                except Empty:
                    break

                try:
                    result = process_single_file(
                        file_info,
                        llm,
                        api_key,
                        is_local,
                        match_existing,
                        existing_categories,
                        custom_prompt,
                        image_recognition_method,
                        processed_files,
                        processed_files_lock,
                        progress_callback,
                        total_files,
                        output_base_dir,
                        reference_items,
                        max_retries=3,
                        retry_delay=1.0,
                        language=language,
                        enable_cache=True,
                        online_model=model_name,
                        api_base=api_base,
                        moonshot_key=moonshot_key,
                        model_name=model_name
                    )

                    # Bucket the error by keywords found in its message.
                    # NOTE(review): these are plain substring tests, so "ai"
                    # also matches inside words such as "failed".
                    if "error" in result:
                        error_msg = result["error"].lower()
                        if any(keyword in error_msg for keyword in ["network", "timeout", "connection", "api"]):
                            update_error_stats("network_errors")
                        elif any(keyword in error_msg for keyword in ["file", "permission", "not found"]):
                            update_error_stats("file_errors")
                        elif any(keyword in error_msg for keyword in ["ai", "model", "classification"]):
                            update_error_stats("ai_errors")
                        else:
                            update_error_stats("other_errors")
                    else:
                        update_error_stats("", is_success=True)

                    handle_result(result)

                except Exception as process_error:
                    logger.error(f"线程 {thread_id} 处理文件时出错: {str(process_error)}")
                    update_error_stats("other_errors")

                    # Record the file as unclassified with the exception text.
                    error_result = {
                        "file_info": file_info,
                        "error": f"处理异常: {str(process_error)}"
                    }
                    handle_result(error_result)

                finally:
                    try:
                        file_queue.task_done()
                    except ValueError:
                        # task_done() raises ValueError when called more often
                        # than items were fetched; deliberately ignored.
                        pass

            except Exception as e:
                logger.error(f"工作线程 {thread_id} 出现严重错误: {str(e)}")
                update_error_stats("other_errors")
                # Short pause before trying again.
                time.sleep(0.5)

        logger.info(f"工作线程 {thread_id} 结束")

    # Start the worker threads.
    start_time = time.time()
    threads = []

    def calculate_optimal_workers(files_to_process, max_workers, is_local=False):
        """Pick a worker count from the file-type mix, file count and max_workers."""
        from lib.async_scanner import thread_manager

        total_files = len(files_to_process)

        # Count files per rough complexity class.
        simple_files = 0   # plain-text style files
        complex_files = 0  # office/PDF files and anything unrecognized
        image_files = 0    # image files

        simple_extensions = {'.txt', '.md', '.rtf', '.log', '.csv', '.json'}
        complex_extensions = {'.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx'}
        image_extensions = {'.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.webp', '.gif'}

        for file_info in files_to_process:
            ext = Path(file_info["file_path"]).suffix.lower()
            if ext in simple_extensions:
                simple_files += 1
            elif ext in complex_extensions:
                complex_files += 1
            elif ext in image_extensions:
                image_files += 1
            else:
                complex_files += 1  # conservative: unknown counts as complex

        # Ask the thread manager for a baseline.
        if is_local:
            # Local model: depends on the dominant file type.
            if image_files > total_files * 0.5:
                optimal_workers = thread_manager.get_workers_for_task("image_processing", total_files, "complex")
            elif simple_files > total_files * 0.7:
                optimal_workers = thread_manager.get_workers_for_task("local_model", total_files, "simple")
            else:
                optimal_workers = thread_manager.get_workers_for_task("local_model", total_files, "medium")
        else:
            # Online model.
            optimal_workers = thread_manager.get_workers_for_task("online_model", total_files)

        # Adjust the baseline by the number of files.
        if total_files <= 5:
            optimal_workers = 1
        elif total_files <= 10:
            optimal_workers = min(2, optimal_workers)
        elif total_files <= 50:
            optimal_workers = min(optimal_workers, max_workers)
        else:
            # Large batch: allow two more threads, still capped by max_workers.
            optimal_workers = min(optimal_workers + 2, max_workers)

        logger.info(f"文件分析: 简单文件 {simple_files} 个，复杂文件 {complex_files} 个，图片文件 {image_files} 个")
        logger.info(f"推荐线程数: {optimal_workers} (系统最大: {max_workers})")

        return optimal_workers

    effective_max_workers = calculate_optimal_workers(files_to_process, max_workers, is_local)

    # Second, coarser file-type summary for the log. It uses a smaller
    # "simple" extension set than calculate_optimal_workers, so the two log
    # lines can report different counts.
    simple_files = sum(1 for f in files_to_process
                      if Path(f["file_path"]).suffix.lower() in {'.txt', '.md', '.rtf'})
    complex_files = total_files - simple_files

    logger.info(f"文件分析: 简单文件 {simple_files} 个，复杂文件 {complex_files} 个")
    logger.info(f"启动 {effective_max_workers} 个工作线程处理 {total_files} 个文件")

    for i in range(effective_max_workers):
        t = threading.Thread(target=worker, name=f"Worker-{i+1}")
        t.daemon = True  # daemon threads do not keep the process alive
        t.start()
        threads.append(t)

    # Monitor progress and the error rate while the queue drains.
    monitor_interval = 5  # seconds between progress checks
    last_check_time = start_time
    last_processed_count = 0
    stuck_count = 0  # consecutive checks without progress
    max_stuck_cycles = 12  # 12 checks * 5 s = 60 s before acting on a stall

    try:
        # Poll until the queue is empty, checking the state periodically.
        while not file_queue.empty():
            try:
                time.sleep(min(monitor_interval, 1.0))

                current_time = time.time()
                if current_time - last_check_time >= monitor_interval:
                    current_processed = processed_count
                    processing_rate = (current_processed - last_processed_count) / monitor_interval

                    # Stall detection: no progress although work remains.
                    if current_processed == last_processed_count and not file_queue.empty():
                        stuck_count += 1

                        # Only dead threads are alarming; live ones may simply
                        # be busy with a slow file.
                        alive_threads = [t for t in threads if t.is_alive()]
                        if len(alive_threads) > 0:
                            logger.info(f"处理进度暂停 ({stuck_count}/{max_stuck_cycles}) - 线程正常运行中，可能在处理复杂文件")
                        else:
                            logger.warning(f"检测到处理停滞 ({stuck_count}/{max_stuck_cycles}) - 无活跃线程")

                        if stuck_count >= max_stuck_cycles:
                            if len(alive_threads) == 0:
                                logger.error("所有工作线程已停止，尝试恢复...")
                                # Force a garbage-collection pass before restarting.
                                gc.collect()

                                # Restart conservatively with a single worker.
                                recovery_workers = 1
                                for i in range(recovery_workers):
                                    t = threading.Thread(target=worker, name=f"Recovery-Worker-{i+1}")
                                    t.daemon = True
                                    t.start()
                                    threads.append(t)
                                logger.info(f"重新启动了 {recovery_workers} 个恢复线程")
                                stuck_count = 0
                            else:
                                # Threads are alive but made no progress for a
                                # long time: keep waiting.
                                logger.warning(f"线程运行中但长时间无进度，可能在处理复杂文件（PDF/大图片）")
                                logger.info(f"活跃线程数: {len(alive_threads)}/{len(threads)}")
                                # Halve the counter instead of zeroing it so the
                                # next warning comes sooner.
                                stuck_count = max_stuck_cycles // 2
                    else:
                        stuck_count = 0  # progress was made

                    with stats_lock:
                        total_errors = sum([error_stats[key] for key in error_stats if key.endswith('_errors')])
                        error_rate = total_errors / max(1, current_processed) if current_processed > 0 else 0

                        logger.info(f"处理进度: {current_processed}/{total_files} "
                                  f"(速度: {processing_rate:.1f}文件/秒, 错误率: {error_rate:.1%})")

                        # Warn when more than half of a meaningful sample failed.
                        if error_rate > 0.5 and current_processed > 10:
                            logger.warning(f"错误率较高 ({error_rate:.1%})，请检查网络连接和API配置")

                    last_check_time = current_time
                    last_processed_count = current_processed

            except KeyboardInterrupt:
                logger.info("收到中断信号，正在停止处理...")
                break

        # Wait until every fetched item has been marked done.
        # NOTE(review): this also runs after the KeyboardInterrupt break
        # above, so an interrupt still waits for the queue to drain.
        file_queue.join()

    except Exception as monitor_error:
        logger.error(f"监控过程中出错: {str(monitor_error)}")

    # Give every worker a short grace period to finish.
    logger.info("等待所有工作线程完成...")
    for i, t in enumerate(threads):
        if t.is_alive():
            t.join(timeout=2.0)
            if t.is_alive():
                logger.warning(f"线程 {t.name} 未能在2秒内完成，强制继续避免卡死")

    # Log timing and statistics.
    end_time = time.time()

    with stats_lock:
        total_errors = sum([error_stats[key] for key in error_stats if key.endswith('_errors')])
        success_count = processed_count - total_errors

        logger.info(f"\n=== 处理完成统计 ===")
        logger.info(f"总耗时: {end_time - start_time:.2f} 秒")
        logger.info(f"总文件数: {total_files}")
        logger.info(f"成功处理: {success_count}")
        logger.info(f"处理失败: {total_errors}")
        logger.info(f"成功率: {success_count / max(1, total_files):.1%}")

        if total_errors > 0:
            logger.info(f"错误详情:")
            logger.info(f"  网络错误: {error_stats['network_errors']}")
            logger.info(f"  文件错误: {error_stats['file_errors']}")
            logger.info(f"  AI错误: {error_stats['ai_errors']}")
            logger.info(f"  其他错误: {error_stats['other_errors']}")

        if circuit_breaker["is_open"]:
            logger.warning("注意: 断路器处于打开状态，部分文件可能未被处理")

        logger.info("===================\n")

    # Merge similar categories.
    logger.info("开始合并相似类别...")

    # Merging only makes sense with more than one category.
    categories_count = len(organized_files.get("categories", {}))
    if categories_count > 1:
        logger.info(f"检测到 {categories_count} 个类别，尝试合并相似类别")

        try:
            merged_files = merge_similar_categories(organized_files, 0.8, llm)

            # Adopt the merged structure only if it reduced the category count.
            merged_categories_count = len(merged_files.get("categories", {}))
            if merged_categories_count < categories_count:
                logger.info(f"合并后的类别数量: {merged_categories_count}（原数量: {categories_count}）")
                organized_files = merged_files
            else:
                logger.info("没有找到需要合并的类别")
        except Exception as e:
            logger.error(f"合并类别时出错: {str(e)}")
            logger.info("继续使用原始分类结果")

    # Intelligent aggregation of categories.
    if enable_aggregation:
        logger.info("开始智能聚合类别...")
        try:
            aggregated_files = intelligent_aggregate_categories(organized_files, llm, max_categories, language)

            # Adopt the aggregated structure only if it reduced the category count.
            aggregated_categories_count = len(aggregated_files.get("categories", {}))
            original_categories_count = len(organized_files.get("categories", {}))

            if aggregated_categories_count < original_categories_count:
                logger.info(f"聚合后的类别数量: {aggregated_categories_count}（原数量: {original_categories_count}）")
                organized_files = aggregated_files
            else:
                logger.info("类别数量未超过限制，无需聚合")

        except Exception as e:
            logger.error(f"智能聚合类别时出错: {str(e)}")
            logger.info("继续使用合并后的分类结果")
    else:
        logger.info("智能聚合功能已禁用")

    return organized_files


def test_custom_api(api_base: str, api_key: str, model_name: str = "gpt-3.5-turbo") -> tuple:
    """Send a minimal chat request to check that a custom API endpoint and key work.

    Args:
        api_base: API base URL.
        api_key: API key.
        model_name: Model name.

    Returns:
        tuple: (success: bool, message: str). On failure the message is the
        text of the raised exception, or a fixed message when the response
        was empty. Exceptions are never propagated to the caller.
    """
    try:
        logger.info(f"正在测试API连接...")
        logger.info(f"API基础URL: {api_base}")
        logger.info(f"模型名称: {model_name}")
        # For an endpoint on local port 11434 the process-wide proxy
        # variables are pointed at that port. A substring test is used so a
        # match at index 0 (URL given without a scheme) is recognized too;
        # the former `find(...) > 0` comparison missed that case.
        if 'localhost:11434' in api_base or '127.0.0.1:11434' in api_base:
            os.environ["http_proxy"] = "http://127.0.0.1:11434"
            os.environ["https_proxy"] = "http://127.0.0.1:11434"

        # LLM instance used only for this probe.
        test_llm = ChatOpenAI(
            model_name=model_name,
            temperature=0,
            api_key=api_key,
            base_url=api_base
        )

        # Deliberately trivial prompt without any template variables.
        system_message = "你是一个AI助手。"
        user_message = "你好"

        test_prompt = ChatPromptTemplate.from_messages([
            ("system", system_message),
            ("user", user_message)
        ])

        # Send the probe request.
        logger.info("发送测试请求...")
        chain = test_prompt | test_llm
        response = chain.invoke({})
        logger.info(f"测试请求响应: {response}")

        # A non-empty content field counts as success.
        if response and response.content:
            logger.info(f"API响应成功！")
            logger.info(f"响应内容: {response.content}")
            return True, "连接测试成功"
        else:
            # logger.warning matches the rest of the file; warn() is the
            # deprecated alias in the standard logging API.
            logger.warning("API响应为空")
            return False, "API响应为空"

    except Exception as e:
        error_msg = str(e)
        logger.error(f"API测试失败: {error_msg}")
        return False, error_msg


def clean_llm_response(response: str) -> str:
    """Strip reasoning tags, code fences and list decoration from an LLM response.

    Processing steps, in order:
      1. Remove <think>...</think> and <thinking>...</thinking> sections.
      2. If the text contains a Markdown code fence, keep only the content of
         the first fenced block. A language tag on the opening fence
         (for example "json", "JSON" or "text") is not part of the content.
      3. Unless the text starts with "{", keep only the first non-empty line,
         strip surrounding quotes and remove leading digits, dots, dashes,
         asterisks, plus signs and whitespace (list numbering/bullets).
         Note that this also removes digits a name legitimately starts with.

    Args:
        response: Raw LLM response.

    Returns:
        str: The cleaned content; "" for an empty or None response.
    """
    if not response:
        return ""

    import re

    # Drop <think> sections and their content.
    response = re.sub(r'<think>.*?</think>', '', response, flags=re.DOTALL | re.IGNORECASE)

    # Drop the alternative <thinking> tag as well.
    response = re.sub(r'<thinking>.*?</thinking>', '', response, flags=re.DOTALL | re.IGNORECASE)

    response = response.strip()

    # Unwrap Markdown code fences (used for JSON responses).
    if "```json" in response:
        response = response.split("```json")[1].split("```")[0].strip()
    elif "```" in response:
        fenced = response.split("```")[1]
        # Text between the opening fence and the first newline is the fence
        # info string (a language tag), not content. Drop it when it looks
        # like a bare tag and real content follows; otherwise it would be
        # returned as the "first line" below.
        info, newline, body = fenced.partition("\n")
        if newline and body.strip() and re.fullmatch(r'[A-Za-z][\w+#-]*', info.strip()):
            fenced = body
        response = fenced.strip()

    # For filename-style (single-line) answers keep only the first non-empty
    # line; JSON objects are left intact.
    lines = [line.strip() for line in response.split('\n') if line.strip()]
    if lines and not response.startswith('{'):
        response = lines[0]

    # Non-JSON answers: strip quotes and leading numbering/punctuation.
    if not response.startswith('{'):
        response = response.strip('"\'')
        response = re.sub(r'^[\d\.\-\*\+\s]*', '', response)

    return response.strip()


def _extract_filename_structure(filename: str) -> dict:
    """从文件名中提取结构化信息（日期、编号等）
    
    Args:
        filename: 不含扩展名的文件名
        
    Returns:
        dict: 包含提取的结构化信息
            - dates: 提取到的日期列表
            - numbers: 提取到的编号列表
            - has_date: 是否包含日期
            - date_prefix: 开头的日期（如果有）
            - remaining_text: 移除日期后的文本
    """
    import re
    
    result = {
        "dates": [],
        "numbers": [],
        "has_date": False,
        "date_prefix": None,
        "remaining_text": filename
    }
    
    # 日期模式（按优先级排序）
    date_patterns = [
        (r'^(\d{4}-\d{2}-\d{2})', 'YYYY-MM-DD'),  # 2024-01-29
        (r'^(\d{4}_\d{2}_\d{2})', 'YYYY_MM_DD'),  # 2024_01_29
        (r'^(\d{8})', 'YYYYMMDD'),                # 20240129
        (r'^(\d{4}\.\d{2}\.\d{2})', 'YYYY.MM.DD'), # 2024.01.29
        (r'^(\d{2}-\d{2}-\d{4})', 'DD-MM-YYYY'),  # 29-01-2024
    ]
    
    # 尝试匹配开头的日期
    for pattern, format_type in date_patterns:
        match = re.match(pattern, filename)
        if match:
            date_str = match.group(1)
            result["dates"].append(date_str)
            result["has_date"] = True
            result["date_prefix"] = date_str
            result["date_format"] = format_type
            # 移除日期后的文本（包括紧跟的分隔符）
            remaining = filename[len(date_str):]
            remaining = remaining.lstrip('-_. ')
            result["remaining_text"] = remaining
            logger.info(f"检测到日期前缀: {date_str} (格式: {format_type})")
            break
    
    # 提取所有数字编号
    numbers = re.findall(r'\d+', filename)
    if numbers:
        result["numbers"] = numbers
    
    return result


def rename_file_by_content(file_path: str, llm: ChatOpenAI, api_key: str, is_local: bool, custom_prompt: str = "",
                         max_chars: int = 5000, keep_extension: bool = True,
                         image_recognition_method: ImageRecognitionMethod = "tesseract", language: str = "Chinese", 
                         online_model: str = None, api_base: str = None, model_name: str = None, 
                         file_metadata: dict = None, moonshot_key: Optional[str] = None) -> str:
    """Generate a new file name from the file's content and an optional custom prompt.

    The document is loaded, a language- and file-type-specific prompt is sent
    to the LLM, and the answer is cleaned into a file-system-safe name. If
    anything fails, the original file name is returned unchanged.

    Args:
        file_path: Path of the file to rename.
        llm: LLM used to generate the name.
        api_key: API key, forwarded to ``load_document``.
        is_local: Whether a local model is used.
        custom_prompt: User-defined naming rules; when given they replace the
            default system prompt.
        max_chars: Maximum number of characters read from the file.
        keep_extension: Whether the original extension is appended.
        image_recognition_method: Recognition method for image files.
        language: "Chinese" or "English"; selects the prompt language.
        online_model: Online model type, forwarded to ``load_document``.
        api_base: API base URL, forwarded to ``load_document``.
        model_name: Model name, forwarded to ``load_document``.
        file_metadata: File metadata dict; its 'modified' entry is used by the
            date post-processing step.
        moonshot_key: Forwarded to ``load_document``.

    Returns:
        str: The generated file name (with the lower-cased original extension
        when ``keep_extension`` is True), or the original file name when no
        name could be generated.
    """
    try:
        # Original file name, extension and stem.
        original_filename = os.path.basename(file_path)
        file_extension = Path(file_path).suffix.lower()
        file_name_without_ext = Path(file_path).stem
        
        # Strip odd suffixes such as "_zpf", "_zpf(2)" or a trailing "(2)".
        if '_zpf' in file_name_without_ext or '(' in file_name_without_ext and ')' in file_name_without_ext:
            clean_name = re.sub(r'_zpf\(\d+\)$|_zpf$|\(\d+\)$', '', file_name_without_ext)
            if clean_name != file_name_without_ext:
                file_name_without_ext = clean_name
                logger.info(f"清理了文件名中的奇怪后缀: {original_filename} -> {file_name_without_ext}{file_extension}")
        
        # Pre-extract structured information (date prefix, numbers) from the name.
        extracted_info = _extract_filename_structure(file_name_without_ext)
        logger.info(f"从文件名中提取的结构化信息: {extracted_info}")
        
        # Load the document content.
        docs = load_document(file_path, max_chars=max_chars, api_key=api_key, is_local=is_local, image_recognition_method=image_recognition_method, online_model=online_model, api_base=api_base, model_name=model_name, moonshot_key=moonshot_key)

        if not docs:
            logger.info(f"警告：无法加载文件 {original_filename} 的内容，无法生成新文件名")
            return original_filename
        
        content = docs[0].page_content
        
        # For Office files sent to an online model, the first 1000 characters
        # are enough to name the file and save tokens.
        office_extensions = ['.ppt', '.pptx', '.doc', '.docx', '.xls', '.xlsx']
        if not is_local and file_extension in office_extensions:
            original_length = len(content)
            if original_length > 1000:
                content = content[:1000]
                logger.info(f"📊 重命名优化：Office 文件只使用前 1000 字符生成文件名（原内容: {original_length} 字符，节省 {original_length - 1000} 字符）")
        
        # Too little content cannot produce a meaningful name.
        if len(content.strip()) < 10:
            logger.info(f"警告：文件 {original_filename} 的内容过少，无法生成有意义的文件名")
            return original_filename
        
        # Image files get image-specific prompts.
        supported_image_formats = ['.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tiff', '.webp']
        is_image_file = file_extension in supported_image_formats

        # The prompts below are prompt *templates*: "{" and "}" coming from the
        # file name would otherwise be parsed as template variables and make
        # the chain fail, so they are doubled before interpolation.
        template_safe_name = file_name_without_ext.replace('{', '{{').replace('}', '}}')
        template_safe_remaining = str(extracted_info["remaining_text"]).replace('{', '{{').replace('}', '}}')
        
        # Default system prompt, by language and file kind.
        if language == "English":
            if is_image_file:
                system_prompt = """You are a professional image file naming expert. Your task is to generate a concise, descriptive filename based on the image content.

【Critical Rules for Images】:
1. MUST return only the filename without extension
2. NEVER include any dots (.)
3. Filename length: 5-25 characters
4. Use English words, avoid special characters
5. For images: focus on main objects, scenes, or text content
6. ALWAYS generate a new filename, NEVER return "No change" or similar
7. Make it descriptive and specific to the content
8. For screenshots: include the app/website name if identifiable
9. For photos: describe main subjects or scenes
10. For graphics: describe the design or purpose

【Examples for Images】:
- "company-logo-design" ✅
- "meeting-screenshot-zoom" ✅  
- "sunset-beach-photo" ✅
- "data-analysis-chart" ✅
- "invoice-receipt-march" ✅

Only return the pure filename, no explanations, no quotes, no extensions, no dots. Must generate a new, meaningful filename based on image content."""
            else:
                system_prompt = """You are a professional file naming expert. Your task is to generate a concise, descriptive filename based on the document content.

【Critical Rules for Documents】:
1. MUST return only the filename without extension
2. NEVER include any dots (.)
3. Filename length: 5-25 characters
4. Use English words, avoid special characters
5. Focus on document's main topic or purpose
6. ALWAYS generate a new filename, NEVER return "No change"
7. Make it professional and descriptive

【Examples for Documents】:
- "quarterly-financial-report" ✅
- "project-proposal-marketing" ✅
- "employee-handbook-2024" ✅

Only return the pure filename, no explanations, no quotes, no extensions, no dots. Must generate a new, meaningful filename based on document content."""
        else:
            if is_image_file:
                system_prompt = """你是专业的图片文件命名专家。根据图片内容生成简洁、描述性的文件名。

【图片文件重要规则】：
1. 只返回文件名主体，绝对不要包含扩展名
2. 不要包含任何点号（.）
3. 文件名长度：5-25个字符
4. 使用中文，避免特殊字符
5. 对于图片：重点描述主要物体、场景或文字内容
6. 必须生成新文件名，绝对不要返回"无需更改"等内容
7. 让文件名具有描述性和特定性
8. 截图类：如能识别应用/网站名称请包含
9. 照片类：描述主要主体或场景
10. 图形类：描述设计或用途

【图片正确示例】：
- "公司标志设计" ✅
- "会议截图-腾讯会议" ✅
- "日落海滩照片" ✅
- "数据分析图表" ✅
- "发票收据-三月" ✅

只返回纯文件名，不要任何解释，不要引号，不要扩展名，不要点号。必须基于图片内容生成一个新的、有意义的文件名。"""
            else:
                system_prompt = """你是专业的文件命名专家。根据文档内容生成简洁、描述性的文件名。

【文档重要规则】：
1. 只返回文件名主体，绝对不要包含扩展名
2. 不要包含任何点号（.）
3. 文件名长度：5-25个字符
4. 使用中文，避免特殊字符
5. 基于文档的主要主题或用途命名
6. 必须生成新文件名，绝对不要返回"无需更改"
7. 让文件名专业且具有描述性

【文档正确示例】：
- "季度财务报告" ✅
- "项目提案-市场营销" ✅
- "员工手册-2024" ✅

只返回纯文件名，不要任何解释，不要引号，不要扩展名，不要点号。必须基于文档内容生成一个新的、有意义的文件名。"""

        # With a custom prompt the system prompt is rebuilt from scratch so the
        # user's rules take first priority.
        if custom_prompt:
            # When the original name starts with a date, tell the model to use it.
            date_instruction = ""
            if extracted_info["has_date"] and extracted_info["date_prefix"]:
                detected_date = extracted_info["date_prefix"]
                if language == "English":
                    detected_format = extracted_info.get("date_format", "unknown")
                    date_instruction = f"""
🚨 CRITICAL: DATE DETECTED IN ORIGINAL FILENAME 🚨
I have already extracted the date from the original filename for you:
  → Date: {detected_date}
  → Format: {detected_format}
  → Remaining text: {template_safe_remaining}

YOU MUST USE THIS EXACT DATE: {detected_date}
Do NOT ignore this date. Do NOT extract date from content.
If the user asks to "keep/preserve/retain date", you MUST use: {detected_date}
"""
                else:
                    detected_format = extracted_info.get("date_format", "未知")
                    date_instruction = f"""
🚨 重要：已检测到原文件名中的日期 🚨
我已经从原文件名中为你提取了日期：
  → 日期：{detected_date}
  → 格式：{detected_format}
  → 剩余文本：{template_safe_remaining}

你必须使用这个精确的日期：{detected_date}
不要忽略这个日期。不要从文档内容中提取日期。
如果用户要求"保留/保持日期"，你必须使用：{detected_date}
"""
            
            # NOTE: custom_prompt is inserted unescaped on purpose so that
            # placeholders such as {文件类型} / {file_type} keep working.
            if language == "English":
                system_prompt = f"""🎯 PRIMARY MISSION - USER'S CUSTOM NAMING RULES:
The user has specified the following custom naming requirements. These requirements are ABSOLUTE and MUST be followed strictly:

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
{custom_prompt}
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
{date_instruction}
📋 CRITICAL INSTRUCTIONS:
1. The original filename will be shown in the user message - carefully analyze it
2. If user asks to "keep/preserve/retain" parts (dates, prefixes, numbers):
   → Use the extracted information provided above
   → Maintain their EXACT format (including separators like - or _)
3. If user asks to "add" something (company, project name):
   → Extract from document content
   → Insert at the position specified by user

💡 EXAMPLE WORKFLOW:
Original: "2021-05-14_批复文件.pdf"
User rule: "keep date, add company name in middle"
Your process:
  Step 1: Use provided date → "2021-05-14"
  Step 2: Identify content → company is "蓝山环保公司"
  Step 3: Build name → "2021-05-14_蓝山环保公司_批复文件"

⚠️ BASIC TECHNICAL RULES (only when not conflicting with user's rules):
- Return ONLY the filename body (no extension, no dots)
- Avoid invalid characters: / \\ : * ? " < > |
- Length can be up to 50 characters when preserving information
- NEVER return "No change" - always generate a proper filename

Remember: USER'S REQUIREMENTS ARE ABSOLUTE. The technical rules are secondary."""
            else:
                system_prompt = f"""🎯 首要任务 - 用户的自定义命名规则：
用户指定了以下自定义命名要求。这些要求是绝对的，必须严格遵守：

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
{custom_prompt}
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
{date_instruction}
📋 关键执行指令：
1. 原始文件名会在用户消息中显示 - 仔细分析它
2. 如果用户要求"保留"某些部分(日期、前缀、编号等)：
   → 使用上面为你提取的信息
   → 保持完全相同的格式(包括分隔符如 - 或 _)
3. 如果用户要求"添加"某些内容(企业名称、项目名称等)：
   → 从文档内容中提取
   → 插入到用户指定的位置

💡 示例工作流程：
原文件名："2021-05-14_批复文件.pdf"
用户规则："保留日期，中间添加企业名称"
你的处理过程：
  步骤1：使用提供的日期 → "2021-05-14"
  步骤2：识别内容 → 企业是"蓝山环保公司"
  步骤3：构建文件名 → "2021-05-14_蓝山环保公司_批复文件"

⚠️ 基本技术规则（仅在不与用户要求冲突时适用）：
- 只返回文件名主体（不要扩展名、不要点号）
- 避免非法字符：/ \\ : * ? " < > |
- 保留信息时长度可达50个字符
- 绝不返回"无需更改" - 始终生成正确的文件名

记住：用户的要求是绝对的。技术规则是次要的。"""
        
        # User prompt: a different layout depending on whether custom rules exist.
        if custom_prompt:
            if language == "English":
                user_prompt = f"""📁 ORIGINAL FILENAME (analyze carefully):
{template_safe_name}

📄 DOCUMENT CONTENT:
{{content}}

🎯 YOUR TASK:
Based on the user's custom naming rules above, generate the new filename by:
1. Extracting required parts from the ORIGINAL FILENAME (dates, prefixes, etc.)
2. Extracting information from DOCUMENT CONTENT (company names, project names, etc.)
3. Combining them according to the user's specified format

Return only the new filename (no extension, no explanation)."""
            else:
                user_prompt = f"""📁 原始文件名（请仔细分析）：
{template_safe_name}

📄 文档内容：
{{content}}

🎯 你的任务：
根据上述用户的自定义命名规则，生成新文件名，步骤如下：
1. 从原始文件名中提取需要的部分（日期、前缀等）
2. 从文档内容中提取信息（企业名称、项目名称等）
3. 按照用户指定的格式组合它们

只返回新文件名（不要扩展名、不要解释）。"""
        else:
            if language == "English":
                if is_image_file:
                    user_prompt = f"""Original filename: {template_safe_name}

Analyze this image content and generate a descriptive filename:

{{content}}

Generate a professional filename that describes the main content, objects, or purpose of this image. Focus on what makes this image unique and identifiable."""
                else:
                    user_prompt = f"""Original filename: {template_safe_name}

Analyze this document content and generate a descriptive filename:

{{content}}

Generate a professional filename that captures the main topic and purpose of this document."""
            else:
                if is_image_file:
                    user_prompt = f"""原始文件名：{template_safe_name}

分析这个图片内容并生成描述性文件名：

{{content}}

生成一个专业的文件名，描述这个图片的主要内容、物体或用途。重点关注让这个图片独特且易于识别的特征。"""
                else:
                    user_prompt = f"""原始文件名：{template_safe_name}

分析这个文档内容并生成描述性文件名：

{{content}}

生成一个专业的文件名，概括这个文档的主要主题和用途。"""
        
        # Build the prompt template.
        prompt = ChatPromptTemplate.from_messages([
            ("system", system_prompt),
            ("user", user_prompt)
        ])

        # Call the LLM through the rate-limit (HTTP 429) retry wrapper.
        def call_rename_model():
            chain = prompt | llm
            template_vars = {"content": content}
            
            # Supply the file-type variables in case custom_prompt uses
            # {文件类型} or {file_type}.
            if file_extension:
                file_type = file_extension.lstrip('.')
                template_vars["文件类型"] = file_type
                template_vars["file_type"] = file_type
            
            return chain.invoke(template_vars)
        
        result = retry_on_rate_limit(call_rename_model, max_retries=3, base_delay=2)

        # Raw answer, then the answer with any reasoning text stripped.
        raw_response = result.content.strip()
        logger.info(f"LLM原始响应: {raw_response}")
        
        new_filename = clean_llm_response(raw_response)
        logger.info(f"清理后的文件名: {new_filename}")
        
        # Detect answers that are refusals or requests instead of a name.
        # Phrases are lower-case and compared case-insensitively.
        if language == "English":
            problematic_phrases = ["please provide", "cannot generate", "need more", "cannot determine", "unclear", "no change", "keep original", "maintain original", "original name"]
        else:
            problematic_phrases = ["请提供", "无法生成", "需要更多", "无法确定", "不清楚", "no change", "无需更改", "不需要更改", "保持原名"]
        lowered_filename = new_filename.lower()
        if any(phrase in lowered_filename for phrase in problematic_phrases):
            logger.info(f"生成的文件名包含问题短语: {new_filename}，尝试重新生成")
            
            # A "No change"-style answer is retried with a more forceful prompt.
            if any(phrase in lowered_filename for phrase in ["no change", "无需更改", "不需要更改", "保持原名"]):
                logger.info("检测到AI返回'No change'，使用强制重命名提示词重新生成")
                
                if language == "English":
                    force_prompt = ChatPromptTemplate.from_messages([
                        ("system", """You must generate a completely new filename for the file. Do not say "No change" or similar.
                        
Please generate a 5-20 character English filename based on the core theme of the file content.
The filename should be concise, professional, and reflect the main content of the file.
Absolutely do not include extensions, do not include dots.
You must return a new filename, cannot refuse."""),
                        ("user", "File content: {content}\n\nPlease generate a new filename for this file (no extension):")
                    ])
                else:
                    force_prompt = ChatPromptTemplate.from_messages([
                        ("system", """你必须为文件生成一个全新的文件名。不允许说"No change"或类似的话。
                        
请根据文件内容的核心主题，生成一个5-20个字符的中文文件名。
文件名要简洁、专业，体现文件的主要内容。
绝对不要包含扩展名，不要包含点号。
必须返回一个新的文件名，不能拒绝。"""),
                        ("user", "文件内容：{content}\n\n请为这个文件生成一个新的文件名（不要扩展名）：")
                    ])
                
                try:
                    # The forced retry also goes through the 429 retry wrapper.
                    def call_force_model():
                        force_chain = force_prompt | llm
                        # Only the first 500 characters are sent for the retry.
                        force_template_vars = {"content": content[:500]}
                        
                        if file_extension:
                            file_type = file_extension.lstrip('.')
                            force_template_vars["文件类型"] = file_type
                            force_template_vars["file_type"] = file_type
                        
                        return force_chain.invoke(force_template_vars)
                    
                    force_result = retry_on_rate_limit(call_force_model, max_retries=3, base_delay=2)
                    force_filename = clean_llm_response(force_result.content)
                    
                    # Drop anything from the first dot on.
                    if '.' in force_filename:
                        force_filename = force_filename.split('.')[0]
                    
                    force_filename = force_filename.strip()
                    
                    if force_filename and len(force_filename) > 2 and not any(phrase in force_filename.lower() for phrase in ["no change", "无需更改"]):
                        logger.info(f"强制重新生成的文件名: {force_filename}")
                        new_filename = force_filename
                    else:
                        # Last resort: build a name from the first words of the content.
                        logger.info("强制重新生成也失败，基于内容关键词生成文件名")
                        content_words = content[:100].replace('\n', ' ').split()
                        if content_words:
                            new_filename = ''.join(content_words[:3])[:15]  # first 3 words, at most 15 chars
                        else:
                            if language == "English":
                                new_filename = f"Document_{file_name_without_ext[:10]}"
                            else:
                                new_filename = f"文档_{file_name_without_ext[:10]}"
                except Exception as e:
                    error_msg = str(e).lower()
                    is_rate_limit_error = (
                        "429" in error_msg or 
                        "too many requests" in error_msg or 
                        "rate limit" in error_msg
                    )
                    
                    if is_rate_limit_error:
                        logger.error(f"强制重新生成文件名时遇到速率限制，重试失败: {str(e)}")
                    else:
                        logger.error(f"强制重新生成文件名失败: {str(e)}")
                    
                    # Fall back to a variant of the original name.
                    if language == "English":
                        new_filename = f"Renamed_{file_name_without_ext[:15]}"
                    else:
                        new_filename = f"重命名_{file_name_without_ext[:15]}"
            else:
                # Any other problem phrase: keep the (cleaned) original stem.
                new_filename = file_name_without_ext
        
        # Aggressive cleanup: cut the name at the first dot. A dotted date
        # prefix taken from the original name (e.g. 2024.01.29) is protected,
        # otherwise it would be truncated to just the year.
        cleaned_name = new_filename.strip()
        preserved_date = extracted_info.get("date_prefix") or ""
        
        if '.' in cleaned_name:
            before_cut = cleaned_name
            if '.' in preserved_date and cleaned_name.startswith(preserved_date):
                cleaned_name = preserved_date + cleaned_name[len(preserved_date):].split('.')[0]
            else:
                cleaned_name = cleaned_name.split('.')[0]
            if cleaned_name != before_cut:
                logger.info(f"移除了点号及其后面的所有内容: {new_filename} -> {cleaned_name}")
        
        cleaned_name = cleaned_name.strip()
        
        # If nothing is left, fall back to the original stem.
        if not cleaned_name:
            base_name = os.path.splitext(os.path.basename(file_path))[0]
            if not base_name:
                base_name = "Renamed_File" if language == "English" else "重命名文件"
            cleaned_name = base_name
            logger.info(f"清理后文件名为空，使用原始文件名: {cleaned_name}")
        
        # Replace characters Windows forbids plus full-width punctuation. The
        # curly quotes are written as escapes so they cannot be mangled into
        # plain quotes (which would silently change what the list contains).
        invalid_chars = ['/', '\\', ':', '*', '?', '"', '<', '>', '|', '，', '。', '！', '？', '；', '：', '\u2018', '\u2019', '\u201c', '\u201d', '【', '】', '、']
        for char in invalid_chars:
            cleaned_name = cleaned_name.replace(char, '_')
        
        # Collapse runs of underscores and trim them from both ends.
        cleaned_name = re.sub(r'_+', '_', cleaned_name)
        cleaned_name = cleaned_name.strip('_')
        
        # Limit the length.
        if len(cleaned_name) > 50:
            cleaned_name = cleaned_name[:50]
        
        # Final guard against an empty name.
        if not cleaned_name:
            base_name = os.path.splitext(os.path.basename(file_path))[0]
            if not base_name:
                base_name = "Renamed_File" if language == "English" else "重命名文件"
            cleaned_name = base_name
        
        # Date post-processing: if the custom prompt asks for a date and the
        # generated name carries none, prepend the modification date.
        if file_metadata and custom_prompt:
            date_keywords_cn = ['日期', '时间', '修改日期', '创建日期']
            date_keywords_en = ['date', 'time', 'modification date', 'creation date', 'modified', 'created']
            
            if language == "Chinese":
                requires_date = any(keyword in custom_prompt for keyword in date_keywords_cn)
            else:
                requires_date = any(keyword.lower() in custom_prompt.lower() for keyword in date_keywords_en)
            
            if requires_date:
                modified_time = file_metadata.get('modified', '')
                mod_date_only = str(modified_time).split(' ')[0] if modified_time else ''
                
                if mod_date_only:
                    # Dates that count as "already present": the modification
                    # date in the separator styles that survive sanitizing
                    # ('/' has become '_' by now), and the date preserved from
                    # the original file name.
                    known_dates = [
                        mod_date_only,  # 2024-12-08
                        mod_date_only.replace('-', ''),  # 20241208
                        mod_date_only.replace('-', '_'),  # 2024_12_08
                        preserved_date,
                    ]
                    has_date_in_name = any(
                        date_text and date_text in cleaned_name for date_text in known_dates
                    )
                    
                    if not has_date_in_name:
                        cleaned_name = f"{mod_date_only}+{cleaned_name}"
                        logger.info(f"🔧 AI未添加日期，手动添加日期前缀: {mod_date_only}")
        
        # Append the original extension if requested.
        if keep_extension:
            final_filename = cleaned_name + file_extension
        else:
            final_filename = cleaned_name
        
        logger.info(f"为文件 {original_filename} 生成新文件名: {final_filename}")
        return final_filename
        
    except Exception as e:
        logger.error(f"生成文件名时出错: {str(e)}")
        # On any error keep the original file name.
        return os.path.basename(file_path)


def merge_similar_categories(organized_files: dict, similarity_threshold: float = 0.8, llm: ChatOpenAI = None) -> dict:
    """Merge categories that are judged similar, to reduce over-classification.

    Args:
        organized_files: Classification result dict with a "categories" entry.
        similarity_threshold: Threshold handed to ``are_categories_similar``.
        llm: LLM used for the similarity check; without one the work is
            delegated to ``simple_merge_categories``.

    Returns:
        dict: The merged structure, or ``organized_files`` itself when there
        is nothing to merge.
    """
    # Nothing to do for missing/empty input or a single category.
    if not organized_files or "categories" not in organized_files or not organized_files["categories"]:
        return organized_files

    names = list(organized_files["categories"].keys())
    if len(names) <= 1:
        return organized_files

    # Without an LLM fall back to plain string matching.
    if llm is None:
        return simple_merge_categories(organized_files)

    chunk_size = 20  # categories handled per batch
    chunk_count = (len(names) + chunk_size - 1) // chunk_size

    redirect = {}      # merged-away category -> category it is merged into
    finished = set()   # categories that need no further comparison

    for start in range(0, len(names), chunk_size):
        anchors = names[start:start + chunk_size]
        logger.info(f"处理第 {start//chunk_size + 1}/{chunk_count} 批类别比较")

        for anchor in anchors:
            if anchor in finished:
                continue

            destination = redirect.get(anchor, anchor)

            # Compare only against categories not yet finished.
            candidates = [name for name in names if name not in finished and name != anchor]

            for offset in range(0, len(candidates), chunk_size):
                try:
                    for candidate in candidates[offset:offset + chunk_size]:
                        if candidate in finished:
                            continue

                        if are_categories_similar(anchor, candidate, llm, similarity_threshold):
                            redirect[candidate] = destination
                            finished.add(candidate)

                    # Free memory after each comparison batch.
                    import gc
                    gc.collect()

                except Exception as e:
                    # A failure abandons the rest of this comparison batch only.
                    logger.error(f"比较类别相似性时出错: {str(e)}")
                    continue

            finished.add(anchor)

    # Rebuild the structure with every category routed to its destination.
    merged = {
        "categories": {},
        "unclassified": organized_files.get("unclassified", [])
    }

    for name, payload in organized_files["categories"].items():
        bucket = merged["categories"].setdefault(redirect.get(name, name), {"subcategories": {}})

        for sub_name, sub_payload in payload["subcategories"].items():
            sub_bucket = bucket["subcategories"].setdefault(sub_name, {"files": []})
            sub_bucket["files"].extend(sub_payload["files"])

    return merged


def intelligent_aggregate_categories(organized_files: dict, llm: ChatOpenAI = None, max_categories: int = 8, language: str = "Chinese") -> dict:
    """Aggregate fine-grained categories under broader ones using the LLM.

    Categories are sent to the LLM in batches; each batch returns a plan
    ``{aggregate name: [original categories]}``. Plans from all batches are
    merged (member lists of equally named aggregates are combined) and
    applied with ``apply_aggregation_plan``. When no usable plan is obtained,
    ``simple_aggregate_categories`` is used instead.

    Args:
        organized_files: Classification result dict with a "categories" entry.
        llm: LLM used to produce the aggregation plan.
        max_categories: Aggregation only runs when there are more categories
            than this.
        language: "English" selects the English prompt; anything else the
            Chinese one. Also forwarded to ``simple_aggregate_categories``.

    Returns:
        dict: The aggregated structure, or ``organized_files`` itself when no
        aggregation is needed.
    """
    import json
    import re

    if not organized_files or "categories" not in organized_files or not organized_files["categories"]:
        return organized_files
    
    categories = list(organized_files["categories"].keys())
    if len(categories) <= max_categories:
        logger.info(f"当前类别数量({len(categories)})未超过最大限制({max_categories})，无需聚合")
        return organized_files
    
    if llm is None:
        logger.warning("未提供LLM模型，使用简单聚合方式")
        return simple_aggregate_categories(organized_files, max_categories, language)
    
    logger.info(f"开始智能聚合，当前类别数量: {len(categories)}，目标: {max_categories}")
    
    try:
        # Process at most 50 categories per LLM call.
        batch_size = 50
        category_batches = [categories[i:i + batch_size] for i in range(0, len(categories), batch_size)]
        
        # Combined plan of all batches: aggregate name -> list of original categories.
        all_aggregation_plans = {}
        
        for batch_index, batch_categories in enumerate(category_batches):
            logger.info(f"处理第 {batch_index + 1}/{len(category_batches)} 批类别")
            
            categories_text = ", ".join(batch_categories)
            
            if language == "English":
                prompt = f"""
Please analyze the following file classification list and aggregate them into broader main categories.

CRITICAL REQUIREMENT: All aggregated category names MUST be in English, regardless of the original category language.

Current classification list:
{categories_text}

Please aggregate according to the following requirements:
1. Aggregate semantically similar or related categories under the same main category
2. Give each aggregated main category a comprehensive name IN ENGLISH ONLY
3. Category names should be concise and clear, able to summarize the characteristics of all subcategories
4. Even if original categories are in Chinese (like "简历", "教育"), use English names (like "Resume", "Education")

Please return the aggregation plan in JSON format as follows:
{{
    "aggregation_plan": {{
        "Resume": ["简历", "个人简历", ...],
        "Education": ["教育行业简历", "教育文档", ...],
        ...
    }}
}}

Only return JSON with English category names, no other explanations.
"""
            else:
                prompt = f"""
请分析以下文件分类列表，将它们聚合成更概括的主要类别。

当前分类列表：
{categories_text}

请按照以下要求进行聚合：
1. 将语义相似或相关的类别聚合到同一个主类别下
2. 为每个聚合后的主类别起一个概括性的名称
3. 类别名称要简洁明了，能够概括其下所有子类别的特征

请以JSON格式返回聚合方案，格式如下：
{{
    "aggregation_plan": {{
        "概括类别名1": ["原类别1", "原类别2", ...],
        "概括类别名2": ["原类别3", "原类别4", ...],
        ...
    }}
}}

只返回JSON，不要其他解释。
"""
            
            try:
                # Ask the LLM for this batch's aggregation plan.
                response = llm.invoke(prompt)
                response_text = response.content.strip()
                
                # Take the outermost {...} span of the answer as the JSON payload.
                json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
                if not json_match:
                    logger.warning(f"第 {batch_index + 1} 批未能从LLM响应中提取JSON格式")
                    continue

                try:
                    aggregation_data = json.loads(json_match.group())
                except json.JSONDecodeError as e:
                    logger.warning(f"第 {batch_index + 1} 批解析LLM响应JSON失败: {e}")
                    continue

                batch_plan = aggregation_data.get("aggregation_plan") if isinstance(aggregation_data, dict) else None
                if not isinstance(batch_plan, dict):
                    logger.warning(f"第 {batch_index + 1} 批LLM响应格式不正确，缺少aggregation_plan字段")
                    continue

                # Merge into the combined plan. Lists of equally named
                # aggregates are extended rather than overwritten, so an
                # aggregate returned by several batches keeps all its members.
                for aggregate_name, members in batch_plan.items():
                    if isinstance(members, str):
                        # A bare string stands for a single original category.
                        members = [members]
                    elif not isinstance(members, list):
                        continue
                    collected = all_aggregation_plans.setdefault(aggregate_name, [])
                    collected.extend(member for member in members if isinstance(member, str))
                
            except Exception as batch_error:
                logger.error(f"处理第 {batch_index + 1} 批类别时出错: {str(batch_error)}")
                continue
        
        if all_aggregation_plans:
            return apply_aggregation_plan(organized_files, all_aggregation_plans)
        else:
            logger.warning("未能获取有效的聚合方案，使用简单聚合方式")
            return simple_aggregate_categories(organized_files, max_categories, language)
            
    except Exception as e:
        logger.error(f"智能聚合过程中出错: {str(e)}")
        return simple_aggregate_categories(organized_files, max_categories, language)


def simple_aggregate_categories(organized_files: dict, max_categories: int = 8, language: str = "Chinese") -> dict:
    """Aggregate categories by keyword matching, without an LLM.

    Each category name is matched case-insensitively against a fixed keyword
    table; the first aggregate (in table order) with a keyword contained in
    the name wins, and unmatched categories go to the catch-all aggregate.

    Args:
        organized_files: Classification result dict with a "categories" entry.
        max_categories: Aggregation only runs when there are more categories
            than this.
        language: "English" selects the English table and aggregate names;
            anything else the Chinese ones.

    Returns:
        dict: Result of ``apply_aggregation_mapping``, or ``organized_files``
        itself when no aggregation is needed.
    """
    if not organized_files or "categories" not in organized_files:
        return organized_files

    category_names = list(organized_files["categories"].keys())
    if len(category_names) <= max_categories:
        return organized_files

    # Keyword table and catch-all aggregate for the selected language. The
    # catch-all has no keywords, so it is kept out of the table.
    if language == "English":
        fallback_bucket = "Other Files"
        keyword_table = {
            "Documents": ["document", "material", "report", "manual", "guide", "tutorial", "instruction"],
            "Media Files": ["image", "photo", "picture", "screenshot", "media", "video", "audio"],
            "Office Files": ["spreadsheet", "presentation", "slide", "PPT", "Excel", "Word", "office"],
            "Technical Documentation": ["code", "program", "technical", "development", "API", "documentation"],
            "Financial": ["financial", "accounting", "invoice", "bill", "receipt", "expense"],
            "Contracts": ["contract", "agreement", "terms", "legal", "accord"],
            "Personal Files": ["personal", "resume", "certificate", "identity", "private"],
        }
    else:
        fallback_bucket = "其他文件"
        keyword_table = {
            "文档资料": ["文档", "资料", "报告", "说明", "手册", "指南", "教程"],
            "图片媒体": ["图片", "照片", "图像", "截图", "媒体", "视频", "音频"],
            "办公文件": ["表格", "演示", "文稿", "PPT", "Excel", "Word", "办公"],
            "技术文档": ["代码", "程序", "技术", "开发", "API", "文档"],
            "财务相关": ["财务", "会计", "发票", "账单", "收据", "报销"],
            "合同协议": ["合同", "协议", "条款", "法律", "协定"],
            "个人文件": ["个人", "简历", "证件", "身份", "私人"],
        }

    # Flatten to (lower-cased keyword, aggregate) pairs, preserving table
    # order so that the first matching aggregate still wins.
    ordered_rules = [
        (keyword.lower(), bucket)
        for bucket, keywords in keyword_table.items()
        for keyword in keywords
    ]

    category_mapping = {}
    for name in category_names:
        lowered = name.lower()
        category_mapping[name] = next(
            (bucket for keyword, bucket in ordered_rules if keyword in lowered),
            fallback_bucket,
        )

    return apply_aggregation_mapping(organized_files, category_mapping)


def apply_aggregation_plan(organized_files: dict, aggregation_plan: dict) -> dict:
    """Regroup a classification result according to an aggregation plan.

    The plan lists, for every new category, the original categories that
    should be folded into it. It is inverted into an "original name -> new
    name" lookup and handed to ``apply_aggregation_mapping``.

    Args:
        organized_files: Original classification result.
        aggregation_plan: Mapping of the form
            ``{"new category": ["old category 1", "old category 2", ...]}``.

    Returns:
        dict: The aggregated file structure produced by
        ``apply_aggregation_mapping``.
    """
    # If an original category is listed under several new categories, the
    # last one encountered wins (plain dict overwrite).
    old_to_new = {
        source_name: target_name
        for target_name, source_names in aggregation_plan.items()
        for source_name in source_names
    }
    return apply_aggregation_mapping(organized_files, old_to_new)


def apply_aggregation_mapping(organized_files: dict, category_mapping: dict) -> dict:
    """Rebuild the classification tree after renaming/merging categories.

    Args:
        organized_files: Original classification result. Must contain a
            ``"categories"`` dict whose values hold a ``"subcategories"`` dict,
            each entry of which has a ``"files"`` list.
        category_mapping: ``{"original category": "new category"}``.
            Categories missing from the mapping keep their own name.

    Returns:
        dict: A new structure with ``"categories"`` and ``"unclassified"``
        keys. The ``"unclassified"`` value is taken from the input as-is.
    """
    merged_categories = {}
    aggregated_files = {
        "categories": merged_categories,
        "unclassified": organized_files.get("unclassified", [])
    }

    for source_name, source_data in organized_files["categories"].items():
        target_name = category_mapping.get(source_name, source_name)
        was_remapped = source_name != target_name

        # Create the target category on first use.
        target_subs = merged_categories.setdefault(
            target_name, {"subcategories": {}}
        )["subcategories"]

        for sub_name, sub_data in source_data["subcategories"].items():
            # A category that was folded into another one survives as a
            # subcategory named after itself; its own subcategories collapse
            # into that single bucket. Untouched categories keep their layout.
            bucket_name = source_name if was_remapped else sub_name
            bucket = target_subs.setdefault(bucket_name, {"files": []})
            bucket["files"].extend(sub_data["files"])

    count_before = len(organized_files['categories'])
    count_after = len(aggregated_files['categories'])
    logger.info(f"聚合完成，类别数量从 {count_before} 减少到 {count_after}")

    return aggregated_files


def simple_merge_categories(organized_files: dict) -> dict:
    """Merge similarly named categories using plain string heuristics.

    Category names are normalized (lower-cased, non-alphanumerics dropped)
    and compared pairwise, shortest name first. A longer name is merged into
    a shorter one when one normalized name contains the other, or, for names
    longer than three characters, when they share a three-character prefix or
    are within an edit distance of 3.

    Names that normalize to an empty string (e.g. ``"???"``) are never merged:
    the empty string is a substring of every string, so treating it as a
    match would silently fold such a category into an unrelated one.

    Args:
        organized_files: Classification result with a ``"categories"`` dict.

    Returns:
        dict: The merged structure. The input object itself is returned
        unchanged when it is empty, has no ``"categories"`` key, or has at
        most one category.
    """
    if not organized_files or "categories" not in organized_files:
        return organized_files

    categories = list(organized_files["categories"].keys())
    if len(categories) <= 1:
        return organized_files

    def normalize_category(cat):
        return ''.join(char.lower() for char in cat if char.isalnum())

    def looks_similar(norm_a, norm_b):
        # Guard first: "" in anything is True, which would match everything.
        if not norm_a or not norm_b:
            return False
        if norm_a in norm_b or norm_b in norm_a:
            return True
        if len(norm_a) > 3 and len(norm_b) > 3:
            return (norm_a[:3] == norm_b[:3]
                    or levenshtein_distance(norm_a, norm_b) <= 3)
        return False

    # Shortest names first so that the shorter name becomes the merge target.
    categories.sort(key=len)

    # Normalize once per category instead of once per compared pair.
    normalized = {cat: normalize_category(cat) for cat in categories}

    category_mapping = {}
    absorbed = set()  # categories already merged into an earlier one

    for i, cat1 in enumerate(categories):
        if cat1 in absorbed:
            continue
        for cat2 in categories[i + 1:]:
            if cat2 in absorbed:
                continue
            if looks_similar(normalized[cat1], normalized[cat2]):
                category_mapping[cat2] = cat1
                absorbed.add(cat2)

    merged_files = {
        "categories": {},
        "unclassified": organized_files.get("unclassified", [])
    }

    # Apply the mapping; subcategories with the same name are concatenated.
    for category, data in organized_files["categories"].items():
        target_category = category_mapping.get(category, category)
        target_subs = merged_files["categories"].setdefault(
            target_category, {"subcategories": {}}
        )["subcategories"]

        for subcategory, subdata in data["subcategories"].items():
            target_subs.setdefault(subcategory, {"files": []})["files"].extend(subdata["files"])

    return merged_files


def levenshtein_distance(s1, s2):
    """Return the Levenshtein (edit) distance between two strings.

    Uses the two-row dynamic-programming formulation: only the previous and
    the current row of the distance matrix are kept.
    """
    # Put the longer string on the outer loop so each row has the length of
    # the shorter one.
    if len(s1) >= len(s2):
        longer, shorter = s1, s2
    else:
        longer, shorter = s2, s1

    if len(shorter) == 0:
        return len(longer)

    prev_row = list(range(len(shorter) + 1))
    for row_no, ch_long in enumerate(longer, start=1):
        row = [row_no]
        for col, ch_short in enumerate(shorter):
            cost_insert = prev_row[col + 1] + 1
            cost_delete = row[col] + 1
            cost_replace = prev_row[col] + (0 if ch_long == ch_short else 1)
            row.append(min(cost_insert, cost_delete, cost_replace))
        prev_row = row

    return prev_row[-1]


def are_categories_similar(cat1: str, cat2: str, llm: ChatOpenAI, threshold: float = 0.8) -> bool:
    """Decide whether two category names are similar enough to be merged.

    A similarity score in [0, 1] is determined once per (case-insensitive,
    unordered) pair, stored in a function-level cache, and compared against
    ``threshold``. Every path compares the score against ``threshold``, so a
    first call and a later cached call with the same arguments always agree.

    Score sources, in order:
      * identical names (ignoring case): 1.0
      * one name contains the other (ignoring case): 0.9
      * the first number found in the LLM's reply, clamped to [0, 1]
      * fallback to ``are_categories_similar_simple`` (1.0 / 0.0) when the
        LLM reply contains no number or the LLM call raises.

    Args:
        cat1: First category name.
        cat2: Second category name.
        llm: LLM used to score pairs that the string shortcuts cannot decide.
        threshold: Minimum score for the pair to count as similar.

    Returns:
        bool: True when the pair's score is at least ``threshold``.
    """
    if not hasattr(are_categories_similar, 'similarity_cache'):
        are_categories_similar.similarity_cache = {}
    cache = are_categories_similar.similarity_cache

    name1, name2 = cat1.lower(), cat2.lower()
    # Sorted so that (a, b) and (b, a) share one cache entry.
    cache_key = tuple(sorted([name1, name2]))

    def remember(score: float) -> bool:
        """Store the score for this pair and compare it to the threshold."""
        cache[cache_key] = score
        return score >= threshold

    if cache_key in cache:
        return cache[cache_key] >= threshold

    if name1 == name2:
        return remember(1.0)

    # Containment is a strong hint, but not proof, that the names match.
    if name1 in name2 or name2 in name1:
        return remember(0.9)

    try:
        # {cat1} / {cat2} are template variables filled in by chain.invoke().
        prompt = ChatPromptTemplate.from_messages([
            ("system", (
                "你是一个专业的文档分类专家。请判断两个类别名称是否表示相似的概念，可以合并为同一类别。\n"
                "            \n"
                "只返回一个数字，表示相似度，范围从0到1：\n"
                "- 1表示完全相同或极其相似，可以合并\n"
                "- 0表示完全不同，无法合并\n"
                "- 中间值表示部分相似\n"
                "\n"
                "不要解释原因，只返回一个数字。"
            )),
            ("user", "类别1: {cat1}\n类别2: {cat2}\n\n这两个类别的相似度是多少？只返回一个0到1之间的小数。")
        ])

        chain = prompt | llm
        reply = chain.invoke({
            "cat1": cat1,
            "cat2": cat2
        })

        response_text = reply.content.strip()

        # Take the first number in the reply as the score.
        numbers = re.findall(r"0\.\d+|\d+\.\d+|\d+", response_text)
        if numbers:
            return remember(max(0.0, min(1.0, float(numbers[0]))))

        logger.warning(f"无法从LLM响应中提取相似度值: {response_text}")
        # No usable number: fall back to the string heuristics.
        return remember(1.0 if are_categories_similar_simple(cat1, cat2) else 0.0)

    except Exception as e:
        logger.error(f"使用LLM判断类别相似性时出错: {str(e)}")
        # LLM unavailable or failed: fall back to the string heuristics.
        return remember(1.0 if are_categories_similar_simple(cat1, cat2) else 0.0)


def are_categories_similar_simple(cat1: str, cat2: str) -> bool:
    """Decide whether two category names are similar using string heuristics.

    Names are normalized by lower-casing and dropping every non-alphanumeric
    character. Two names are similar when, after normalization:
      * one contains the other, or
      * they share their first three characters, or
      * their edit distance divided by the longer length is at most 0.3.

    A name without any alphanumeric character normalizes to the empty string,
    which is a substring of every string. Such names are therefore compared
    literally (trimmed, case-insensitive) instead of fuzzily, so that e.g.
    ``"???"`` is not reported as similar to every other category.

    Args:
        cat1: First category name.
        cat2: Second category name.

    Returns:
        bool: True when the names are considered similar.
    """
    norm_cat1 = ''.join(char.lower() for char in cat1 if char.isalnum())
    norm_cat2 = ''.join(char.lower() for char in cat2 if char.isalnum())

    # Nothing to fuzzy-match on: only literally equal names are similar.
    if not norm_cat1 or not norm_cat2:
        return cat1.strip().lower() == cat2.strip().lower()

    if norm_cat1 in norm_cat2 or norm_cat2 in norm_cat1:
        return True

    # Shared prefix (up to three characters, limited by the shorter name).
    prefix_length = min(3, len(norm_cat1), len(norm_cat2))
    if norm_cat1[:prefix_length] == norm_cat2[:prefix_length]:
        return True

    # Edit distance relative to the longer name.
    distance = levenshtein_distance(norm_cat1, norm_cat2)
    max_length = max(len(norm_cat1), len(norm_cat2))
    return distance / max_length <= 0.3


def test_reference_items_functionality():
    """Print a small manual demonstration of the reference-items feature.

    Three reference-item configurations (file name only, file name plus
    content, content only) are run through the same content-assembly rules
    and the resulting analysis text is printed for each. Nothing is asserted
    or returned.
    """
    test_content = "这是一个关于项目管理的文档，包含了项目计划和执行细节。"
    test_filename = "项目管理计划-2023年度"

    def make_reference_items(content, filename):
        """Build a reference-items dict with only content/filename varying."""
        return {
            'content': content,
            'filename': filename,
            'type': False,
            'size': False,
            'date': False,
            'custom_prompt': ''
        }

    def build_analysis_content(reference_items):
        """Assemble the analysis text (stated to mirror classify_document)."""
        use_content = reference_items.get('content', True)
        use_filename = reference_items.get('filename', False)

        text = ""
        if use_filename and test_filename:
            text += f"文件名：{test_filename}\n\n"

        if use_content and test_content:
            text += test_content
        elif not use_content and use_filename and test_filename:
            # File name only: drop the trailing blank lines added above.
            text = f"文件名：{test_filename}"
        elif not text:
            text = test_content
        return text

    # (title, reference items, print the file name?, print the content?)
    cases = [
        ("测试用例1: 仅使用文件名", make_reference_items(False, True), True, False),
        ("测试用例2: 同时使用文件名和内容", make_reference_items(True, True), True, True),
        ("测试用例3: 仅使用内容（传统方式）", make_reference_items(True, False), False, True),
    ]

    print("参考项功能测试")
    print("=" * 50)

    for title, reference_items, show_filename, show_content in cases:
        print(f"\n{title}")
        if show_filename:
            print(f"文件名: {test_filename}")
        if show_content:
            print(f"文档内容: {test_content}")
        print(f"参考项设置: {reference_items}")
        print(f"生成的分析内容: {build_analysis_content(reference_items)}")

    print("\n" + "=" * 50)
    print("测试完成！")


def check_network_connectivity(api_base: str = None, timeout: int = 10) -> bool:
    """Check whether the network (or a specific API server) is reachable.

    With ``api_base`` a TCP connection is opened to the URL's host. The port
    is the one given in the URL, otherwise 443 for ``https`` and 80 for
    anything else. Without ``api_base`` a request to https://www.google.com
    is used as a generic connectivity probe.

    The timeout applies to this probe only; the process-wide socket default
    timeout is left untouched, and the probe connection is always closed.

    Args:
        api_base: Base URL of the API to probe, e.g.
            ``"http://localhost:11434/v1"``. A value without a scheme
            (``"host:port"``) is accepted as well.
        timeout: Timeout in seconds.

    Returns:
        bool: True when the probe succeeded, False otherwise. Never raises.
    """
    import socket
    import urllib.request
    from urllib.parse import urlparse

    try:
        if api_base:
            # Fallback label for the log message if the URL cannot be parsed.
            host = api_base
            try:
                # urlparse only recognizes host/port after a "//" marker.
                target = api_base if "://" in api_base else f"//{api_base}"
                parsed_url = urlparse(target)

                # .hostname excludes the port and credentials, unlike .netloc,
                # which cannot be passed to connect() when a port is present.
                if not parsed_url.hostname:
                    raise ValueError(f"无法从URL中解析主机名: {api_base}")
                host = parsed_url.hostname
                port = parsed_url.port or (443 if parsed_url.scheme == 'https' else 80)

                # Resolves DNS, connects, and closes the socket on exit.
                with socket.create_connection((host, port), timeout=timeout):
                    pass
                logger.info(f"API服务器连接正常: {host}")
                return True
            except Exception as e:
                logger.warning(f"API服务器连接失败: {host}, 错误: {str(e)}")
                return False
        else:
            # Generic connectivity probe; the response is closed immediately.
            with urllib.request.urlopen('https://www.google.com', timeout=timeout):
                pass
            logger.info("网络连接正常")
            return True
    except Exception as e:
        logger.warning(f"网络连接检查失败: {str(e)}")
        return False


def create_robust_llm(llm_type: LLMType = "openai",
                     model_name: str = "gpt-3.5-turbo",
                     api_base: Optional[str] = None,
                     api_key: Optional[str] = None,
                     temperature: float = 0,
                     max_retries: int = 3,
                     timeout: int = 120) -> ChatOpenAI:
    """Create an LLM instance with a request timeout and retry count applied.

    When ``api_base`` is given its reachability is probed first; a failed
    probe only produces a warning and does not prevent the instance from
    being created.

    Args:
        llm_type: LLM type.
        model_name: Model name.
        api_base: API base URL.
        api_key: API key.
        temperature: Sampling temperature.
        max_retries: Value assigned to the instance's ``max_retries``
            attribute, if it has one.
        timeout: Value assigned to the instance's ``request_timeout``
            attribute, if it has one.

    Returns:
        The instance returned by ``create_llm``.
    """
    if api_base:
        server_reachable = check_network_connectivity(api_base)
        if not server_reachable:
            logger.warning("API服务器连接异常，可能会影响处理效果")

    llm = create_llm(llm_type, model_name, api_base, api_key, temperature)

    # Only override settings that this LLM implementation actually exposes.
    overrides = (
        ("request_timeout", timeout),
        ("max_retries", max_retries),
    )
    for attr_name, attr_value in overrides:
        if hasattr(llm, attr_name):
            setattr(llm, attr_name, attr_value)

    return llm


def safe_file_operation(operation_func, *args, max_retries: int = 3, **kwargs):
    """Run a file operation, retrying when it fails with an OS-level error.

    Args:
        operation_func: Callable performing the file operation.
        *args: Positional arguments forwarded to ``operation_func``.
        max_retries: Maximum number of attempts.
        **kwargs: Keyword arguments forwarded to ``operation_func``.

    Returns:
        The callable's return value, or None when every attempt failed with
        an ``OSError`` (which includes ``PermissionError`` and
        ``FileNotFoundError``), when any other exception was raised, or when
        ``max_retries`` allows no attempt at all.
    """
    final_attempt = max_retries - 1
    attempt = 0
    while attempt < max_retries:
        try:
            return operation_func(*args, **kwargs)
        except OSError as e:
            logger.warning(f"文件操作失败 (尝试 {attempt + 1}/{max_retries}): {str(e)}")
            if attempt >= final_attempt:
                logger.error(f"文件操作最终失败: {str(e)}")
                return None
            # Linearly growing back-off: 0.5s, 1.0s, 1.5s, ...
            time.sleep(0.5 * (attempt + 1))
        except Exception as e:
            # Anything that is not an OS error is not retried.
            logger.error(f"文件操作出现未预期错误: {str(e)}")
            return None
        attempt += 1
    return None


def rename_single_file_thread_safe(file_path: str, llm: ChatOpenAI, api_key: str, is_local: bool, 
                                  custom_prompt: str = "", max_chars: int = 5000, 
                                  keep_extension: bool = True,
                                  image_recognition_method: ImageRecognitionMethod = "tesseract",
                                  progress_callback: Optional[Callable] = None,
                                  file_index: int = 0, total_files: int = 0, language: str = "Chinese", 
                                  online_model: Optional[str] = None, api_base: Optional[str] = None,
                                  model_name: Optional[str] = None) -> dict:
    """Compute a content-based new name for one file and report the outcome.

    Wraps ``rename_file_by_content`` so that it never raises: any failure of
    the rename step is converted into an ``"error"`` result that keeps the
    original file name.

    Progress reporting is best-effort. It is skipped when ``total_files`` is
    not positive (a percentage cannot be computed), and an exception raised
    by ``progress_callback`` is logged instead of turning an otherwise
    successful rename into an error result.

    Args:
        file_path: Path of the file.
        llm: LLM instance.
        api_key: API key.
        is_local: Whether a local model is used.
        custom_prompt: Custom prompt text.
        max_chars: Maximum number of characters.
        keep_extension: Whether to keep the file extension.
        image_recognition_method: Image recognition method.
        progress_callback: Called with an integer percentage after the file
            was processed successfully.
        file_index: Zero-based index of this file within the batch.
        total_files: Number of files in the batch.
        language: Forwarded to ``rename_file_by_content``.
        online_model: Forwarded to ``rename_file_by_content``.
        api_base: Forwarded to ``rename_file_by_content``.
        model_name: Forwarded to ``rename_file_by_content``.

    Returns:
        dict: ``original_path``, ``original_filename``, ``new_filename``,
        ``status`` (``"success"`` or ``"error"``) and ``error`` (message or
        None).
    """
    try:
        original_filename = os.path.basename(file_path)
        logger.info(f"开始重命名文件 ({file_index + 1}/{total_files}): {original_filename}")

        new_filename = rename_file_by_content(
            file_path=file_path,
            llm=llm,
            api_key=api_key,
            is_local=is_local,
            custom_prompt=custom_prompt,
            max_chars=max_chars,
            keep_extension=keep_extension,
            image_recognition_method=image_recognition_method,
            language=language,
            online_model=online_model,
            api_base=api_base,
            model_name=model_name
        )
    except Exception as e:
        error_msg = f"重命名文件时出错: {str(e)}"
        logger.error(f"文件重命名失败 ({file_index + 1}/{total_files}): {os.path.basename(file_path)} - {error_msg}")

        return {
            "original_path": file_path,
            "original_filename": os.path.basename(file_path),
            "new_filename": os.path.basename(file_path),  # keep the original name on failure
            "status": "error",
            "error": error_msg
        }

    # Kept outside the try block above on purpose: a progress-reporting
    # problem must not be reported as a failed rename.
    if progress_callback and total_files > 0:
        try:
            progress_callback(int((file_index + 1) / total_files * 100))
        except Exception as callback_error:
            logger.error(f"进度回调出错: {str(callback_error)}")

    logger.info(f"文件重命名成功 ({file_index + 1}/{total_files}): {original_filename} -> {new_filename}")
    return {
        "original_path": file_path,
        "original_filename": original_filename,
        "new_filename": new_filename,
        "status": "success",
        "error": None
    }


def rename_files_batch(file_paths: List[str], llm: ChatOpenAI, api_key: str, is_local: bool,
                      custom_prompt: str = "", max_chars: int = 5000,
                      keep_extension: bool = True,
                      image_recognition_method: ImageRecognitionMethod = "tesseract",
                      max_workers: int = 5,
                      progress_callback: Optional[Callable] = None,
                      status_callback: Optional[Callable] = None, language: str = "Chinese", 
                      online_model: Optional[str] = None, api_base: Optional[str] = None,
                      model_name: Optional[str] = None) -> List[dict]:
    """Rename a batch of files concurrently.

    Every file is handed to ``rename_single_file_thread_safe`` on a thread
    pool. The call returns once all files have been processed.

    The pool always has at least one worker, so ``max_workers <= 0`` degrades
    to sequential processing instead of waiting forever for work nobody
    picks up.

    Args:
        file_paths: Paths of the files to rename.
        llm: LLM instance.
        api_key: API key.
        is_local: Whether a local model is used.
        custom_prompt: Custom prompt text.
        max_chars: Maximum number of characters.
        keep_extension: Whether to keep the file extension.
        image_recognition_method: Image recognition method.
        max_workers: Upper bound for the number of worker threads.
        progress_callback: Called from worker threads with the overall
            completion percentage (int) after each file.
        status_callback: Called from worker threads as
            ``status_callback(original_path, result)`` after each file.
        language: Forwarded to ``rename_single_file_thread_safe``.
        online_model: Forwarded to ``rename_single_file_thread_safe``.
        api_base: Forwarded to ``rename_single_file_thread_safe``.
        model_name: Forwarded to ``rename_single_file_thread_safe``.

    Returns:
        List[dict]: One result dict per processed file, in input order.
        Exceptions raised by the callbacks are logged and otherwise ignored.
    """
    if not file_paths:
        return []

    total_files = len(file_paths)
    worker_count = max(1, min(max_workers, total_files))
    logger.info(f"开始批量重命名 {total_files} 个文件，使用 {worker_count} 个线程")

    # Results are stored by input index so the output keeps the input order.
    results = [None] * total_files
    completed_count = 0
    state_lock = threading.Lock()

    def record_result(result: dict, index: int):
        """Store one result and notify the callbacks."""
        nonlocal completed_count

        # Snapshot the counter under the lock so the reported percentage
        # belongs to this completion and not to a concurrent one.
        with state_lock:
            results[index] = result
            completed_count += 1
            done_so_far = completed_count

        if status_callback:
            try:
                status_callback(result["original_path"], result)
            except Exception as e:
                logger.error(f"状态回调出错: {str(e)}")

        if progress_callback:
            try:
                progress_callback(int(done_so_far / total_files * 100))
            except Exception as e:
                logger.error(f"进度回调出错: {str(e)}")

    def process_one(index: int, path: str):
        """Rename one file and record its outcome; never raises."""
        try:
            result = rename_single_file_thread_safe(
                file_path=path,
                llm=llm,
                api_key=api_key,
                is_local=is_local,
                custom_prompt=custom_prompt,
                max_chars=max_chars,
                keep_extension=keep_extension,
                image_recognition_method=image_recognition_method,
                file_index=index,
                total_files=total_files,
                language=language,
                online_model=online_model,
                api_base=api_base,
                model_name=model_name
            )
        except Exception as e:
            logger.error(f"工作线程出错: {str(e)}")
            result = {
                "original_path": path,
                "original_filename": os.path.basename(path),
                "new_filename": os.path.basename(path),
                "status": "error",
                "error": f"线程执行出错: {str(e)}"
            }
        try:
            record_result(result, index)
        except Exception as e:
            logger.error(f"工作线程出错: {str(e)}")

    # Leaving the with-block waits for every submitted task to finish.
    with ThreadPoolExecutor(max_workers=worker_count) as executor:
        for index, path in enumerate(file_paths):
            executor.submit(process_one, index, path)

    final_results = [r for r in results if r is not None]

    success_count = sum(1 for r in final_results if r["status"] == "success")
    error_count = len(final_results) - success_count

    logger.info(f"批量重命名完成: 成功 {success_count} 个，失败 {error_count} 个")

    return final_results


def calculate_content_hash(file_path: str, timeout: int = 30) -> str:
    """Compute the MD5 hash of a file's content.

    Files larger than 500 MB are skipped. The computation is abandoned after
    ``timeout`` seconds:

      * on the main thread of a platform that has ``SIGALRM``, an alarm
        signal interrupts the read; the previously installed handler is
        restored afterwards;
      * everywhere else (worker threads, Windows) elapsed time is checked
        after every chunk. Python only allows installing signal handlers
        from the main thread, so the alarm cannot be used there.

    Args:
        file_path: Path of the file.
        timeout: Timeout in seconds.

    Returns:
        str: Hex MD5 digest, or ``""`` when the file is too large, the
        timeout was hit, or any error occurred.
    """
    import signal

    def timeout_handler(signum, frame):
        raise TimeoutError("计算文件哈希值超时")

    use_alarm = (
        hasattr(signal, 'SIGALRM')
        and threading.current_thread() is threading.main_thread()
    )
    alarm_armed = False
    previous_handler = None

    hash_md5 = hashlib.md5()
    try:
        # Avoid hashing very large files.
        file_size = os.path.getsize(file_path)
        if file_size > 500 * 1024 * 1024:  # 500 MB limit
            logger.warning(f"文件过大({file_size / 1024 / 1024:.1f}MB)，跳过哈希计算: {file_path}")
            return ""

        if use_alarm:
            previous_handler = signal.signal(signal.SIGALRM, timeout_handler)
            signal.alarm(timeout)
            alarm_armed = True

        # Monotonic clock: unaffected by wall-clock adjustments.
        deadline = time.monotonic() + timeout
        with open(file_path, "rb") as f:
            for chunk in iter(lambda: f.read(65536), b""):
                hash_md5.update(chunk)
                # Manual timeout check where the alarm is not available.
                if not use_alarm and time.monotonic() > deadline:
                    logger.warning(f"文件哈希计算超时({timeout}秒): {file_path}")
                    return ""

        return hash_md5.hexdigest()
    except TimeoutError:
        logger.warning(f"文件哈希计算超时({timeout}秒): {file_path}")
        return ""
    except Exception as e:
        logger.error(f"计算文件哈希值时出错 {file_path}: {str(e)}")
        return ""
    finally:
        # Cancel our alarm and put back whatever handler was there before.
        if alarm_armed:
            signal.alarm(0)
            if previous_handler is not None:
                signal.signal(signal.SIGALRM, previous_handler)

# Result of classifying one folder. classify_folder() passes this class to
# llm.with_structured_output() and also builds instances by hand when it falls
# back to parsing free-form model output.
# NOTE(review): documented with comments rather than a class docstring because
# BaseModel looks like pydantic's, where a docstring may be exported as part of
# the schema sent to the model -- confirm before converting to a docstring.
class FolderClassification(BaseModel):
    # Broad, top-level category name.
    category: str = Field(description="文件夹的主要分类类别，应该是一个具体且具有概括性的类别名称")
    # More specific category name, expected to relate closely to the folder name.
    subcategory: str = Field(description="文件夹的详细子类别，应该是一个具体的分类名称，必须与文件夹名称高度相关")
    # Confidence of the classification, described to the model as a value in [0, 1].
    confidence: float = Field(description="分类的置信度，范围在0到1之间")


def scan_folder_structure(root_path: str) -> List[dict]:
    """
    Collect every directory found below a root directory.

    The tree is traversed with ``os.walk``; the root itself is not included.

    Args:
        root_path: Directory to scan.

    Returns:
        List[dict]: One entry per sub-directory with the keys
        ``folder_path``, ``folder_name``, ``relative_path`` (relative to
        ``root_path``) and ``parent_path``.
    """
    def describe(parent_dir, name):
        full_path = os.path.join(parent_dir, name)
        return {
            'folder_path': full_path,
            'folder_name': name,
            'relative_path': os.path.relpath(full_path, root_path),
            'parent_path': parent_dir
        }

    return [
        describe(parent_dir, name)
        for parent_dir, child_dirs, _files in os.walk(root_path)
        for name in child_dirs
    ]


@timeout_decorator(timeout_seconds=600)
def safe_classify_folder(folder_name: str, llm: ChatOpenAI, custom_prompt: str = None, 
                        relative_path: str = None, reference_items: dict = None, 
                        language: str = "Chinese") -> FolderClassification:
    """Classify a folder via ``classify_folder`` under ``timeout_decorator``.

    All arguments are forwarded unchanged; see ``classify_folder`` for their
    meaning. The decorator is configured with ``timeout_seconds=600``.
    """
    return classify_folder(
        folder_name=folder_name,
        llm=llm,
        custom_prompt=custom_prompt,
        relative_path=relative_path,
        reference_items=reference_items,
        language=language,
    )


def classify_folder(folder_name: str, llm: ChatOpenAI, custom_prompt: str = None, 
                   relative_path: str = None, reference_items: dict = None, 
                   language: str = "Chinese") -> FolderClassification:
    """
    Classify a folder by its name using the LLM.

    Strategy, in order:
      1. Structured output via ``llm.with_structured_output`` (with
         rate-limit retries).
      2. A plain LLM call whose reply is searched for a JSON object.
      3. Extraction of "category"/"subcategory" labels from the reply text.
      4. A default classification (``"其他"`` / ``"Other"``).

    Args:
        folder_name: Name of the folder.
        llm: Language model instance.
        custom_prompt: Extra user requirements appended to the prompt.
        relative_path: Folder path relative to the scan root; included in the
            prompt when given.
        reference_items: Mapping of reference dimensions to truthy/falsy
            flags. The names of the enabled ones (except ``custom_prompt``)
            are listed in the prompt.
        language: ``"Chinese"`` selects the Chinese prompt and default
            category; any other value selects the English ones.

    Returns:
        FolderClassification: The classification. Never raises: on any
        unexpected error the default category is returned with confidence 0.3.
    """
    is_chinese = language == "Chinese"
    default_category = '其他' if is_chinese else 'Other'

    # Reference dimensions enabled by the caller.
    reference_info = ""
    if reference_items:
        active_references = [key for key, value in reference_items.items() if value and key != 'custom_prompt']
        if active_references:
            if is_chinese:
                reference_info = f"\n\n特别注意以下参考维度：{', '.join(active_references)}"
            else:
                reference_info = f"\n\nSpecial attention to reference dimensions: {', '.join(active_references)}"

    # Free-form user requirements.
    custom_info = ""
    if custom_prompt and custom_prompt.strip():
        if is_chinese:
            custom_info = f"\n\n用户自定义要求：{custom_prompt.strip()}"
        else:
            custom_info = f"\n\nUser custom requirements: {custom_prompt.strip()}"

    # Location of the folder inside the scanned tree.
    path_info = ""
    if relative_path:
        if is_chinese:
            path_info = f"\n文件夹路径：{relative_path}"
        else:
            path_info = f"\nFolder path: {relative_path}"

    if is_chinese:
        system_prompt = f"""你是一个专业的文件夹分类助手。请根据文件夹名称对其进行合理的分类。

任务要求：
1. 主要分类(category)：提供一个概括性的主要类别，如"技术文档"、"办公文件"、"媒体资料"等
2. 详细子类别(subcategory)：提供一个更具体的子类别，应该与文件夹名称紧密相关
3. 置信度(confidence)：评估分类的准确性，范围0-1

分类原则：
- 根据文件夹名称的含义进行分类
- 考虑文件夹的层级结构和上下文
- 主要分类应该具有概括性，便于归档管理
- 子类别应该与文件夹名称高度匹配
- 避免创建过于具体或过于宽泛的分类{reference_info}{custom_info}

请对以下文件夹进行分类：
文件夹名称：{folder_name}{path_info}"""

        user_prompt = f"请对文件夹 '{folder_name}' 进行分类，并提供JSON格式的结果。"
    else:
        system_prompt = f"""You are a professional folder classification assistant. Please classify folders reasonably based on their names.

Task Requirements:
1. Main category: Provide a general main category, such as "Technical Documents", "Office Files", "Media Materials", etc.
2. Detailed subcategory: Provide a more specific subcategory that should be closely related to the folder name
3. Confidence: Evaluate the accuracy of classification, range 0-1

Classification Principles:
- Classify based on the meaning of folder names
- Consider the hierarchical structure and context of folders
- Main categories should be general and convenient for archival management
- Subcategories should highly match the folder names
- Avoid creating overly specific or overly broad classifications{reference_info}{custom_info}

Please classify the following folder:
Folder name: {folder_name}{path_info}"""

        user_prompt = f"Please classify the folder '{folder_name}' and provide results in JSON format."

    try:
        messages = [
            SystemMessage(content=system_prompt),
            HumanMessage(content=user_prompt)
        ]

        # Step 1: structured output (not supported by every backend).
        try:
            structured_llm = llm.with_structured_output(FolderClassification)

            def call_with_parser():
                return structured_llm.invoke(messages)

            result = retry_on_rate_limit(call_with_parser, max_retries=3, base_delay=2)

            if result and hasattr(result, 'category') and hasattr(result, 'subcategory'):
                return result
        except Exception as struct_error:
            error_msg = str(struct_error)
            if 'response_format' in error_msg.lower() or 'unavailable' in error_msg.lower():
                # The backend rejected the structured-output request format.
                logger.warning(f"结构化输出不支持，降级为手动解析: {error_msg}")
            else:
                # Any other failure also falls through to the plain call.
                logger.warning(f"结构化输出失败，尝试降级: {error_msg}")

        # Step 2: plain call, parse the reply ourselves.
        response = llm.invoke(messages)
        response_text = response.content.strip()

        try:
            # First {...} object in the reply (one level of nesting allowed).
            json_match = re.search(r'\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}', response_text, re.DOTALL)
            if json_match:
                data = json.loads(json_match.group())

                # A missing or unparsable confidence must not discard an
                # otherwise usable category.
                try:
                    confidence = float(data.get('confidence', 0.5))
                except (TypeError, ValueError):
                    confidence = 0.5

                # "or" also replaces empty/None values, not just missing keys.
                return FolderClassification(
                    category=data.get('category') or default_category,
                    subcategory=data.get('subcategory') or folder_name,
                    confidence=confidence
                )
        except Exception as parse_error:
            logger.warning(f"JSON解析失败: {parse_error}, 响应内容: {response_text[:200]}")

        # Step 3: look for labelled values in free text.
        try:
            # The look-behinds keep "category"/"类别" from matching inside
            # "subcategory"/"子类别"; both ASCII and full-width colons are
            # accepted as separators.
            category_match = re.search(
                r'(?:(?<![A-Za-z])category|(?<!子)类别|主要分类)["\']?[:：\s]*["\']?([^"\',\n]+)["\']?',
                response_text, re.IGNORECASE)
            subcategory_match = re.search(
                r'(?:subcategory|子类别|详细子类别)["\']?[:：\s]*["\']?([^"\',\n]+)["\']?',
                response_text, re.IGNORECASE)

            if category_match:
                category = category_match.group(1).strip()
                subcategory = subcategory_match.group(1).strip() if subcategory_match else folder_name
                return FolderClassification(
                    category=category,
                    subcategory=subcategory,
                    confidence=0.5
                )
        except Exception as extract_error:
            logger.warning(f"从文本中提取分类信息失败: {extract_error}")

        # Step 4: nothing usable in the reply.
        return FolderClassification(
            category=default_category,
            subcategory=folder_name,
            confidence=0.5
        )

    except Exception as e:
        logger.error(f"文件夹分类失败 {folder_name}: {str(e)}")
        return FolderClassification(
            category=default_category,
            subcategory=folder_name,
            confidence=0.3
        )


def process_folder_classification(folders_to_process: List[dict], output_base_dir: str, 
                                 custom_prompt: str = None, reference_items: dict = None,
                                 llm_type: LLMType = "openai", model_name: str = "gpt-3.5-turbo",
                                 api_base: Optional[str] = None, api_key: Optional[str] = None,
                                 max_workers: int = 4, progress_callback: Optional[Callable] = None,
                                 language: str = "Chinese") -> dict:
    """
    Classify folders with an LLM and group the results by category/subcategory.

    Args:
        folders_to_process: Folder descriptors. Each one is read for
            "folder_name" and "folder_path", and optionally "relative_path".
        output_base_dir: Base output directory; created if it does not exist.
        custom_prompt: Custom prompt forwarded to the classifier.
        reference_items: Reference items forwarded to the classifier.
        llm_type: LLM backend type.
        model_name: Model name.
        api_base: API base URL.
        api_key: API key.
        max_workers: Maximum number of worker threads.
        progress_callback: Called with the number of folders handled so far.
        language: Language setting forwarded to the classifier.

    Returns:
        dict: ``{"categories": {category: {"subcategories": {subcategory:
        {"folders": [...]}}}}, "unclassified": [...]}``.
    """

    if not folders_to_process:
        logger.warning("没有文件夹需要处理")
        return {"categories": {}, "unclassified": []}

    # Build the LLM client; when that fails, every folder is reported as unclassified.
    try:
        llm = create_robust_llm(
            llm_type=llm_type,
            model_name=model_name,
            api_base=api_base,
            api_key=api_key,
            max_retries=3,
            timeout=120,
        )
        logger.info(f"LLM初始化成功: {llm_type}/{model_name}")
    except Exception as llm_error:
        logger.error(f"LLM初始化失败: {str(llm_error)}")
        failed_entries = []
        for entry in folders_to_process:
            failed_entries.append({
                "original_path": entry["folder_path"],
                "folder_name": entry["folder_name"],
                "error": f"LLM初始化失败: {str(llm_error)}",
            })
        return {"categories": {}, "unclassified": failed_entries}

    # Make sure the base output directory exists.
    os.makedirs(output_base_dir, exist_ok=True)

    # Accumulator for the grouped classification results.
    organized_folders = {"categories": {}, "unclassified": []}
    total_folders = len(folders_to_process)
    processed_count = 0

    from concurrent.futures import ThreadPoolExecutor, as_completed
    import threading
    import time

    results_lock = threading.Lock()

    def classify_one(folder_info: dict) -> dict:
        """Classify a single folder; failures are returned as an "error" record."""
        try:
            name = folder_info["folder_name"]
            path = folder_info["folder_path"]
            rel_path = folder_info.get("relative_path", "")

            verdict = safe_classify_folder(
                folder_name=name,
                llm=llm,
                custom_prompt=custom_prompt,
                relative_path=rel_path,
                reference_items=reference_items,
                language=language,
            )
        except Exception as e:
            logger.error(f"处理文件夹 {folder_info.get('folder_name', '未知')} 时出错: {str(e)}")
            return {
                "status": "error",
                "original_path": folder_info.get("folder_path", ""),
                "folder_name": folder_info.get("folder_name", ""),
                "error": str(e),
            }
        else:
            return {
                "status": "success",
                "original_path": path,
                "folder_name": name,
                "relative_path": rel_path,
                "category": verdict.category,
                "subcategory": verdict.subcategory,
                "confidence": verdict.confidence,
            }

    def file_under_category(outcome: dict) -> None:
        """Store a successful outcome under its sanitized category/subcategory."""
        raw_category = outcome["category"]
        raw_subcategory = outcome["subcategory"]

        # Strip characters Windows does not allow in file names.
        from lib.common import Common
        category_key = Common.sanitize_filename(raw_category)
        subcategory_key = Common.sanitize_filename(raw_subcategory)

        category_node = organized_folders["categories"].setdefault(
            category_key, {"subcategories": {}}
        )
        subcategory_node = category_node["subcategories"].setdefault(
            subcategory_key, {"folders": []}
        )
        subcategory_node["folders"].append({
            "original_path": outcome["original_path"],
            "folder_name": outcome["folder_name"],
            "relative_path": outcome["relative_path"],
            "confidence": outcome["confidence"],
        })

    def file_as_unclassified(outcome: dict) -> None:
        """Store a failed outcome in the unclassified list."""
        organized_folders["unclassified"].append({
            "original_path": outcome["original_path"],
            "folder_name": outcome["folder_name"],
            "error": outcome.get("error", "未知错误"),
        })

    # Folders are submitted to the pool in small batches so the caller's UI
    # gets a chance to refresh between batches.
    batch_size = min(5, max(1, total_folders // 4))
    logger.info(f"使用批处理模式，批次大小: {batch_size}")
    batch_total = (total_folders + batch_size - 1) // batch_size

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        batch_starts = range(0, total_folders, batch_size)
        for batch_number, start in enumerate(batch_starts, start=1):
            pending = [
                executor.submit(classify_one, item)
                for item in folders_to_process[start:start + batch_size]
            ]

            # Consume the results of the current batch as they finish.
            for finished in as_completed(pending):
                outcome = finished.result()

                with results_lock:
                    processed_count += 1

                    if outcome["status"] == "success":
                        file_under_category(outcome)
                    else:
                        file_as_unclassified(outcome)

                    # Report progress; a failing callback is logged, not raised.
                    if progress_callback:
                        try:
                            progress = int((processed_count / total_folders) * 100)
                            progress_callback(processed_count)
                            logger.info(f"文件夹分类进度: {processed_count}/{total_folders} ({progress}%)")

                            # Short pause (10 ms) so the UI can update.
                            time.sleep(0.01)
                        except Exception as progress_error:
                            logger.error(f"更新进度时出错: {str(progress_error)}")

            # Slightly longer pause (50 ms) after each batch.
            time.sleep(0.05)
            logger.info(f"完成批次 {batch_number}/{batch_total}")

    logger.info(f"文件夹分类完成，处理了 {processed_count} 个文件夹")
    return organized_folders


def move_or_copy_folders(organized_folders: dict, output_base_dir: str, is_copy: bool = False, 
                        progress_callback: Optional[Callable] = None) -> dict:
    """
    Move or copy each classified folder into its category/subcategory directory.

    Every folder listed under
    ``organized_folders["categories"][category]["subcategories"][subcategory]["folders"]``
    is placed in ``<output_base_dir>/<category>/<subcategory>/<folder_name>``
    (category and subcategory names are passed through
    ``Common.sanitize_filename``). If the target already exists, a numeric
    suffix (``_1``, ``_2``, ...) is appended until the name is free.

    Args:
        organized_folders: Classification result.
        output_base_dir: Base output directory.
        is_copy: True to copy the folders, False to move them.
        progress_callback: Called with the number of folders handled so far,
            after each folder (successful or not). Exceptions raised by the
            callback are logged and ignored.

    Returns:
        dict: ``{"success": [...], "failed": [...]}``. Success entries carry
        "source", "target" and "operation" ("copy" or "move"); failed entries
        carry "source", "target" and "error". Folders whose subcategory
        directory could not be created are reported as failed too.
    """
    import shutil

    operation_results = {
        "success": [],
        "failed": []
    }
    processed_count = 0

    def advance_progress() -> None:
        """Count one handled folder and notify the progress callback."""
        nonlocal processed_count
        processed_count += 1
        if progress_callback:
            try:
                progress_callback(processed_count)
            except Exception as progress_error:
                logger.error(f"更新进度时出错: {str(progress_error)}")

    for category, category_data in organized_folders["categories"].items():
        for subcategory, subcategory_data in category_data["subcategories"].items():
            # Local import: lib.common is only needed once there is work to do.
            from lib.common import Common

            # Strip characters Windows does not allow in directory names.
            target_subcategory_dir = os.path.join(
                output_base_dir,
                Common.sanitize_filename(category),
                Common.sanitize_filename(subcategory),
            )
            folders = subcategory_data["folders"]

            try:
                os.makedirs(target_subcategory_dir, exist_ok=True)
            except Exception as e:
                logger.error(f"创建目录失败 {target_subcategory_dir}: {str(e)}")
                # Without the target directory none of these folders can be
                # placed: report each one as failed and keep progress in step
                # instead of dropping them silently.
                for folder_info in folders:
                    operation_results["failed"].append({
                        "source": folder_info.get("original_path", "未知"),
                        "target": target_subcategory_dir,
                        "error": f"创建目录失败: {str(e)}"
                    })
                    advance_progress()
                continue

            for folder_info in folders:
                # Reset per folder so the error handler never reports paths
                # left over from the previous iteration.
                source_path = "未知"
                target_path = "未知"
                try:
                    source_path = folder_info["original_path"]
                    folder_name = folder_info["folder_name"]
                    target_path = os.path.join(target_subcategory_dir, folder_name)

                    if not os.path.exists(source_path):
                        logger.warning(f"源文件夹不存在: {source_path}")
                        operation_results["failed"].append({
                            "source": source_path,
                            "target": target_path,
                            "error": "源文件夹不存在"
                        })
                        continue

                    # Pick a unique name when the target already exists.
                    counter = 1
                    while os.path.exists(target_path):
                        target_path = os.path.join(
                            target_subcategory_dir, f"{folder_name}_{counter}"
                        )
                        counter += 1

                    if is_copy:
                        shutil.copytree(source_path, target_path)
                        logger.info(f"复制文件夹成功: {source_path} -> {target_path}")
                    else:
                        shutil.move(source_path, target_path)
                        logger.info(f"移动文件夹成功: {source_path} -> {target_path}")

                    operation_results["success"].append({
                        "source": source_path,
                        "target": target_path,
                        "operation": "copy" if is_copy else "move"
                    })

                except Exception as e:
                    # Best effort: one failing folder must not stop the rest.
                    logger.error(f"{'复制' if is_copy else '移动'}文件夹失败 {source_path}: {str(e)}")
                    operation_results["failed"].append({
                        "source": source_path,
                        "target": target_path,
                        "error": str(e)
                    })

                finally:
                    # Runs for success, failure and the early `continue` alike.
                    advance_progress()

    logger.info(f"文件夹操作完成: 成功 {len(operation_results['success'])} 个，失败 {len(operation_results['failed'])} 个")
    return operation_results


def deduplicate_files_by_content(files_to_process: List[dict], progress_callback: Optional[Callable] = None) -> List[dict]:
    """Remove files whose content hash duplicates an earlier file in the list.

    The first file seen for each hash is kept; later files with the same hash
    are dropped. A file is always kept when it has no path, does not exist on
    disk, yields no hash, or raises while being hashed, so nothing is lost by
    accident.

    Args:
        files_to_process: File descriptors; the path is read from the
            'file_path' key, falling back to 'path'.
        progress_callback: Called with an integer in the 1-6 range (this step
            is mapped onto 5% of an overall progress scale). It is invoked on
            every fifth entry and on the last entry, counting skipped entries
            as well so the final update is always delivered.

    Returns:
        List[dict]: The de-duplicated file list, in the original order.
    """
    if not files_to_process:
        return files_to_process

    logger.info(f"开始文件去重，原始文件数量: {len(files_to_process)}")

    # Maps content hash -> every file that produced it (first entry is kept).
    hash_to_files = {}
    unique_files = []

    total_files = len(files_to_process)

    # Count every entry, including the ones skipped below; otherwise the
    # "last entry" progress update is never sent when any file is skipped.
    for processed_count, file_info in enumerate(files_to_process, start=1):
        if processed_count % 5 == 0 or processed_count == total_files:
            progress_percent = processed_count / total_files * 100
            logger.info(f"文件去重进度: {processed_count}/{total_files} ({progress_percent:.1f}%)")

            if progress_callback:
                # De-duplication covers 5% of the overall progress (1% to 6%).
                total_progress = 1 + (progress_percent / 100) * 5
                progress_callback(int(total_progress))

        # Accept both key names: 'file_path' first, then 'path'.
        file_path = file_info.get('file_path') or file_info.get('path', '')

        if not file_path:
            logger.warning(f"文件信息中缺少路径信息: {file_info}")
            unique_files.append(file_info)
            continue

        if not os.path.exists(file_path):
            logger.warning(f"文件不存在，跳过去重: {file_path}")
            unique_files.append(file_info)
            continue

        try:
            # Hash the content with a short timeout.
            file_hash = calculate_content_hash(file_path, timeout=15)

            if not file_hash:
                # No hash available: keep the file.
                logger.debug(f"无法计算哈希值，保留文件: {file_path}")
                unique_files.append(file_info)
                continue

            same_hash_files = hash_to_files.get(file_hash)
            if same_hash_files is None:
                # First file with this hash: it is the one that is kept.
                same_hash_files = hash_to_files[file_hash] = []
                unique_files.append(file_info)

            # Remember every file with this hash for the summary below.
            same_hash_files.append(file_info)

        except Exception as e:
            logger.error(f"处理文件时出错 {file_path}: {str(e)}")
            # Keep the file on error so it is not lost by accident.
            unique_files.append(file_info)

    # Summarize what was removed.
    removed_count = len(files_to_process) - len(unique_files)
    if removed_count > 0:
        logger.info(f"文件去重完成，移除了 {removed_count} 个重复文件")

        for file_hash, files in hash_to_files.items():
            if len(files) > 1:
                logger.info(f"发现重复文件组 (哈希: {file_hash[:8]}...):")
                for i, file_info in enumerate(files):
                    status = "保留" if i == 0 else "移除"
                    file_path = file_info.get('file_path') or file_info.get('path', '未知路径')
                    logger.info(f"  {status}: {file_path}")
    else:
        logger.info("未发现重复文件")

    return unique_files

def select_file_to_keep(duplicate_files: List[dict]) -> Optional[dict]:
    """Choose which file to keep from a group of duplicates.

    Candidates are compared on these criteria, in order; a later criterion
    only breaks ties left by the earlier ones:
    1. A more meaningful file name (not purely numeric, no "copy"/"副本",
       no "temp"/"临时" in the name).
    2. A shorter path.
    3. A more recent modification time (files whose mtime cannot be read
       count as oldest).

    Args:
        duplicate_files: Descriptors of files with identical content; the path
            is read from the 'file_path' key, falling back to 'path'.

    Returns:
        The descriptor to keep, or None when the list is empty. When several
        candidates tie on every criterion, the first one in the list wins.
    """
    if not duplicate_files:
        return None

    if len(duplicate_files) == 1:
        return duplicate_files[0]

    def file_priority(file_info: dict) -> tuple:
        """Build the comparison key (name score, -path length, mtime)."""
        file_path = file_info.get('file_path') or file_info.get('path', '')
        filename = os.path.basename(file_path)
        lowered = filename.lower()

        # Name score: penalize numeric-only, copy-like and temp-like names.
        name_score = 0
        if not re.match(r'^\d+\.(jpg|jpeg|png|pdf|doc|docx)$', lowered):
            name_score += 10
        if 'copy' not in lowered and '副本' not in filename:
            name_score += 5
        if 'temp' not in lowered and '临时' not in filename:
            name_score += 5

        try:
            mtime = os.path.getmtime(file_path)
        except (OSError, ValueError, TypeError):
            # Missing or unreadable file: rank it as the oldest.
            mtime = 0.0

        # Tuples compare element by element, so a lower-priority criterion can
        # never outweigh a higher-priority one, and path length is not capped.
        return (name_score, -len(file_path), mtime)

    return max(duplicate_files, key=file_priority)


if __name__ == "__main__":
    # Set your OpenAI API key here (placeholder: this entry point currently does nothing)
    pass