# -*- coding: utf-8 -*-
"""
智能文件分析模块 - FileNeatAI 智能化核心
提供多模态文件分析、用户偏好学习、上下文感知等功能
"""

import os
import re
import json
import hashlib
from datetime import datetime
from typing import Dict, List, Optional, Tuple, Any
from pathlib import Path
from collections import defaultdict, Counter
from logger import logger


class FileMetadataExtractor:
    """Collects filesystem-level metadata for a single file."""

    @staticmethod
    def extract_metadata(file_path: str) -> Dict[str, Any]:
        """Return size, timestamps and path-derived attributes of a file.

        Args:
            file_path: Path of the file to inspect.

        Returns:
            A dict with the keys ``size``, ``size_mb``, ``created``,
            ``modified``, ``extension``, ``basename``, ``is_hidden``,
            ``directory`` and ``depth``. An empty dict is returned (and the
            error is logged) when the file cannot be inspected.
        """
        timestamp_format = '%Y-%m-%d %H:%M:%S'
        try:
            stat = os.stat(file_path)
            path = Path(file_path)
            file_size = stat.st_size

            # st_ctime is the creation time only on Windows; on Unix it is the
            # time of the last metadata change. Prefer the real birth time on
            # platforms whose stat result exposes it.
            created_ts = getattr(stat, 'st_birthtime', stat.st_ctime)

            return {
                'size': file_size,
                'size_mb': round(file_size / (1024 * 1024), 2),
                'created': datetime.fromtimestamp(created_ts).strftime(timestamp_format),
                'modified': datetime.fromtimestamp(stat.st_mtime).strftime(timestamp_format),
                'extension': path.suffix.lower(),
                'basename': path.stem,
                # Dot-prefixed names only; platform "hidden" flags are not read.
                'is_hidden': path.name.startswith('.'),
                'directory': str(path.parent),
                'depth': len(path.parts) - 1
            }
        except Exception as e:
            # Best-effort: callers treat an empty dict as "no metadata".
            logger.error(f"提取文件元数据失败 {file_path}: {str(e)}")
            return {}


class FilenameSemanticAnalyzer:
    """Rule-based analyzer that derives semantic hints from a file name."""

    def __init__(self):
        # Regex patterns grouped by the kind of hint they signal. They are
        # applied case-insensitively by analyze_filename().
        self.patterns = {
            'date_patterns': [
                r'\d{4}-\d{2}-\d{2}',  # 2024-01-01
                r'\d{4}\d{2}\d{2}',    # 20240101
                r'\d{2}-\d{2}-\d{4}',  # 01-01-2024
                r'\d{4}年\d{1,2}月\d{1,2}日',  # e.g. 2024年1月1日
            ],
            'version_patterns': [
                r'v\d+\.\d+(\.\d+)?',  # v1.0, v1.0.1
                r'版本\d+',            # e.g. 版本1
                r'_v\d+',              # _v1
                r'(final|draft|review|修改|最终|草稿)',
                r'(\d+\.\d+)',         # 1.0
            ],
            'project_patterns': [
                r'(项目|project|proj)',
                r'(方案|plan|proposal|scheme)',
                r'(报告|report|rpt)',
                r'(合同|contract|agreement)',
                r'(会议|meeting|会纪要)',
                r'(培训|training|train)',
                r'(需求|requirement|req)',
                r'(设计|design|des)',
            ],
            'document_type_patterns': [
                r'(简历|resume|cv)',
                r'(发票|invoice|bill)',
                r'(收据|receipt)',
                r'(证书|certificate|cert)',
                r'(手册|manual|guide)',
                r'(教程|tutorial|tut)',
                r'(说明|instruction|readme)',
            ],
            'status_patterns': [
                r'(待办|todo|pending)',
                r'(完成|done|finished|completed)',
                r'(进行中|ongoing|inprogress)',
                r'(已取消|cancelled|canceled)',
                r'(暂停|paused|suspended)',
            ]
        }

    def analyze_filename(self, filename: str) -> Dict[str, Any]:
        """Analyze the semantic features of a file name.

        Args:
            filename: File name to analyze (the extension, if present, is
                analyzed together with the rest of the name).

        Returns:
            A dict with ``has_date``, ``has_version``,
            ``has_project_indicator``, ``document_type``,
            ``status_indicator``, ``keywords``, ``semantic_score``,
            ``language`` and ``naming_style``. ``semantic_score`` grows by a
            fixed amount for every pattern that matches and is not capped.
        """
        analysis = {
            'has_date': False,
            'has_version': False,
            'has_project_indicator': False,
            'document_type': None,
            'status_indicator': None,
            'keywords': [],
            'semantic_score': 0.0,
            'language': self._detect_language(filename),
            'naming_style': self._detect_naming_style(filename)
        }

        for pattern_type, patterns in self.patterns.items():
            for pattern in patterns:
                # Collect the full matched text. re.findall() would return the
                # capture-group content instead, which is '' for "v1.0"
                # against r'v\d+\.\d+(\.\d+)?' and pollutes the keyword list.
                matches = [
                    m.group(0)
                    for m in re.finditer(pattern, filename, re.IGNORECASE)
                ]
                if not matches:
                    continue

                if pattern_type == 'date_patterns':
                    analysis['has_date'] = True
                    analysis['semantic_score'] += 0.2
                elif pattern_type == 'version_patterns':
                    analysis['has_version'] = True
                    analysis['semantic_score'] += 0.15
                elif pattern_type == 'project_patterns':
                    analysis['has_project_indicator'] = True
                    analysis['semantic_score'] += 0.25
                elif pattern_type == 'document_type_patterns':
                    analysis['document_type'] = matches[0]
                    analysis['semantic_score'] += 0.3
                elif pattern_type == 'status_patterns':
                    analysis['status_indicator'] = matches[0]
                    analysis['semantic_score'] += 0.1

                analysis['keywords'].extend(matches)

        # Append the plain tokens obtained by splitting the name.
        analysis['keywords'].extend(self._extract_keywords(filename))

        return analysis

    def _detect_language(self, filename: str) -> str:
        """Return 'chinese' when more than 30% of the characters are CJK
        ideographs (U+4E00..U+9FFF), otherwise 'english'."""
        chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', filename))
        if chinese_chars > len(filename) * 0.3:
            return 'chinese'
        return 'english'

    def _detect_naming_style(self, filename: str) -> str:
        """Classify the naming style; the first rule that applies wins
        (underscore, then hyphen, then an upper-case letter after the first
        character)."""
        if '_' in filename:
            return 'snake_case'
        if '-' in filename:
            return 'kebab-case'
        if any(c.isupper() for c in filename[1:]):
            return 'camelCase'
        return 'lowercase'

    def _extract_keywords(self, filename: str) -> List[str]:
        """Split the name on '-', '_', whitespace and '.', and return the
        lower-cased parts that are longer than two characters and not purely
        numeric."""
        parts = re.split(r'[-_\s\.]', filename)
        return [
            part.lower()
            for part in parts
            if len(part) > 2 and not part.isdigit()
        ]


class ContentTopicExtractor:
    """Keyword-based extractor of topics and document type from text."""

    def __init__(self):
        # Topic name -> keywords whose (case-insensitive) occurrences count
        # towards that topic.
        self.topic_keywords = {
            '财务': ['财务', '会计', '发票', '收据', '账单', '报销', '预算', '成本', 'finance', 'accounting', 'invoice', 'receipt', 'budget'],
            '技术文档': ['API', '技术', '开发', '代码', '程序', '系统', '架构', 'technical', 'development', 'code', 'system'],
            '合同法务': ['合同', '协议', '法律', '条款', '签署', '甲方', '乙方', 'contract', 'agreement', 'legal', 'terms'],
            '人事管理': ['人事', '员工', '招聘', '简历', '绩效', '薪资', 'HR', 'employee', 'recruitment', 'resume', 'salary'],
            '市场营销': ['市场', '营销', '推广', '客户', '销售', '品牌', 'marketing', 'promotion', 'customer', 'sales', 'brand'],
            '项目管理': ['项目', '计划', '进度', '里程碑', '需求', '风险', 'project', 'plan', 'schedule', 'milestone', 'requirement'],
            '培训教育': ['培训', '教育', '课程', '学习', '教材', '考试', 'training', 'education', 'course', 'learning', 'exam'],
            '会议记录': ['会议', '纪要', '议程', '决议', '讨论', '参会', 'meeting', 'minutes', 'agenda', 'discussion'],
        }

    def extract_topics(self, content: str, max_topics: int = 3) -> List[Tuple[str, float]]:
        """Score every predefined topic against the text.

        Args:
            content: Text to analyze.
            max_topics: Maximum number of topics to return.

        Returns:
            ``(topic, score)`` tuples sorted by descending score, each score
            in (0, 1]. Empty when the stripped content is shorter than ten
            characters or no keyword occurs.
        """
        if not content or len(content.strip()) < 10:
            return []

        content_lower = content.lower()
        topic_scores = {}

        for topic, keywords in self.topic_keywords.items():
            score = 0
            for keyword in keywords:
                # Plain substring count (non-overlapping occurrences).
                count = content_lower.count(keyword.lower())
                if count > 0:
                    # Longer keywords are more specific, so they weigh more.
                    score += count * (len(keyword) / 10.0)

            if score > 0:
                # Normalize by text length and cap at 1.0.
                topic_scores[topic] = min(score / len(content) * 1000, 1.0)

        ranked = sorted(topic_scores.items(), key=lambda item: item[1], reverse=True)
        return ranked[:max_topics]

    def identify_document_type(self, content: str) -> Optional[str]:
        """Guess the document type from indicator phrases.

        Args:
            content: Text to analyze; empty or ``None`` yields ``None``.

        Returns:
            The type whose indicators occur most often (the earliest listed
            type wins ties), or ``None`` when fewer than two indicators of any
            type are present.
        """
        if not content:
            return None

        content_lower = content.lower()

        type_indicators = {
            '简历': ['个人信息', '教育背景', '工作经验', '技能', '联系方式'],
            '合同': ['甲方', '乙方', '签署', '条款', '违约', '生效'],
            '报告': ['摘要', '背景', '分析', '结论', '建议'],
            '会议纪要': ['会议时间', '参会人员', '议程', '决议', '行动项'],
            '技术文档': ['API', '接口', '参数', '返回值', '示例代码'],
            '财务文档': ['金额', '税率', '发票号', '账户', '付款方式'],
        }

        max_score = 0
        best_type = None

        for doc_type, indicators in type_indicators.items():
            # The content was lower-cased, so the indicator must be too;
            # otherwise an upper-case indicator such as 'API' never matches.
            score = sum(
                1 for indicator in indicators
                if indicator.lower() in content_lower
            )
            if score > max_score:
                max_score = score
                best_type = doc_type

        return best_type if max_score > 1 else None


class ContextAnalyzer:
    """Derives context hints for a file from the other files around it."""

    def analyze_directory_context(self, directory_files: List[str], current_file: str) -> Dict[str, Any]:
        """Summarize the directory a file lives in.

        Args:
            directory_files: Paths of the files that form the context.
            current_file: Path of the file being analyzed.

        Returns:
            A dict with ``total_files``, ``file_types``, ``naming_patterns``,
            ``project_indicators``, ``related_files`` and
            ``directory_purpose``; an empty dict when ``directory_files`` is
            empty.
        """
        if not directory_files:
            return {}

        return {
            'total_files': len(directory_files),
            'file_types': self._analyze_file_types(directory_files),
            'naming_patterns': self._analyze_naming_patterns(directory_files),
            'project_indicators': self._find_project_indicators(directory_files),
            'related_files': self._find_related_files(directory_files, current_file),
            'directory_purpose': self._infer_directory_purpose(directory_files)
        }

    def _analyze_file_types(self, files: List[str]) -> Dict[str, int]:
        """Count files per lower-cased extension; files without an extension
        are ignored."""
        extensions = [Path(f).suffix.lower() for f in files if Path(f).suffix]
        return dict(Counter(extensions))

    def _analyze_naming_patterns(self, files: List[str]) -> Dict[str, Any]:
        """Count date/version markers and find recurring name prefixes/suffixes.

        Returns:
            ``has_dates`` / ``has_versions`` counts plus up to three
            ``common_prefixes`` / ``common_suffixes`` that occur more than
            once.
        """
        patterns = {
            'has_dates': 0,
            'has_versions': 0,
            'common_prefixes': [],
            'common_suffixes': []
        }

        prefixes = []
        suffixes = []

        for file in files:
            basename = Path(file).stem

            if re.search(r'\d{4}-\d{2}-\d{2}|\d{8}', basename):
                patterns['has_dates'] += 1

            if re.search(r'v\d+\.\d+|版本\d+|_v\d+', basename, re.I):
                patterns['has_versions'] += 1

            # Only names with at least one separator have a prefix/suffix.
            parts = re.split(r'[-_\s]', basename)
            if len(parts) > 1:
                prefixes.append(parts[0])
                suffixes.append(parts[-1])

        if prefixes:
            patterns['common_prefixes'] = [
                p for p, c in Counter(prefixes).most_common(3) if c > 1
            ]

        if suffixes:
            patterns['common_suffixes'] = [
                s for s, c in Counter(suffixes).most_common(3) if c > 1
            ]

        return patterns

    def _find_project_indicators(self, files: List[str]) -> List[str]:
        """List well-known project files and top-level project directories.

        Returns:
            Indicator strings: first one entry per matching project file (in
            input order), then one entry per matching leading directory (in
            first-seen order).
        """
        indicators = []

        # Built once as a set instead of re-lowering the list for every file.
        project_files = {
            name.lower() for name in (
                'package.json', 'requirements.txt', 'pom.xml', 'Makefile',
                'README.md', 'README.txt', '.gitignore', 'LICENSE'
            )
        }

        for file in files:
            basename = Path(file).name.lower()
            if basename in project_files:
                indicators.append(f"项目文件: {basename}")

        # Leading path component of every file that has one. A dict keeps the
        # first-seen order, so the result is deterministic (a set is not).
        directories = {}
        for file in files:
            parts = Path(file).parts
            if len(parts) > 1:
                directories.setdefault(parts[0], None)

        common_project_dirs = {'src', 'lib', 'docs', 'test', 'config', 'assets'}
        for dir_name in directories:
            if dir_name.lower() in common_project_dirs:
                indicators.append(f"项目目录: {dir_name}")

        return indicators

    def _find_related_files(self, files: List[str], current_file: str) -> List[str]:
        """Return up to five files whose stem similarity to ``current_file``
        exceeds 0.6; the current file itself is excluded."""
        if not current_file:
            return []

        current_base = Path(current_file).stem.lower()
        related = []

        for file in files:
            if file == current_file:
                continue

            file_base = Path(file).stem.lower()
            if self._calculate_name_similarity(current_base, file_base) > 0.6:
                related.append(file)

        return related[:5]

    def _calculate_name_similarity(self, name1: str, name2: str) -> float:
        """Jaccard similarity of the tokens obtained by splitting both names
        on '-', '_' and whitespace (case-insensitive).

        Returns 0.0 when either name has no tokens.
        """
        def tokenize(name: str) -> set:
            tokens = set(re.split(r'[-_\s]', name.lower()))
            # re.split() yields '' for empty names and doubled separators;
            # such empty tokens must not count as a shared word.
            tokens.discard('')
            return tokens

        name1_parts = tokenize(name1)
        name2_parts = tokenize(name2)

        if not name1_parts or not name2_parts:
            return 0.0

        shared = name1_parts & name2_parts
        combined = name1_parts | name2_parts
        return len(shared) / len(combined)

    def _infer_directory_purpose(self, files: List[str]) -> Optional[str]:
        """Infer the directory purpose from the extension distribution.

        The categories are checked in order (images, documents, code); the
        first one whose share of all files exceeds its threshold wins.

        Returns:
            The category label, '混合目录' when none dominates, or ``None``
            for an empty file list.
        """
        if not files:
            return None

        file_types = self._analyze_file_types(files)

        # (label, extensions, minimum share of all files, exclusive)
        categories = (
            ('图片目录', {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp'}, 0.7),
            ('文档目录', {'.pdf', '.doc', '.docx', '.txt', '.md'}, 0.7),
            ('代码目录', {'.py', '.js', '.java', '.cpp', '.c', '.h', '.css', '.html'}, 0.5),
        )

        total = len(files)
        for label, extensions, threshold in categories:
            matching = sum(
                count for ext, count in file_types.items() if ext in extensions
            )
            if matching > total * threshold:
                return label

        return '混合目录'


class UserPreferenceLearner:
    """Persists classification history and predicts categories from it."""

    def __init__(self, preferences_file: Optional[str] = None):
        """Load preferences from ``preferences_file``.

        Args:
            preferences_file: Path of the JSON preferences file. When omitted
                (or empty) the path from ``_get_default_preferences_file`` is
                used.
        """
        self.preferences_file = preferences_file or self._get_default_preferences_file()
        self.preferences = self._load_preferences()

    def _get_default_preferences_file(self) -> str:
        """Return ``user_preferences.json`` inside the path reported by
        ``Common.getConfigPath()``."""
        # Imported lazily, as in the original code, so the project module is
        # only needed when no explicit path is given.
        from lib.common import Common
        config_path = Common.getConfigPath()
        return os.path.join(config_path, 'user_preferences.json')

    @staticmethod
    def _default_preferences() -> Dict[str, Any]:
        """Return a fresh preference structure with empty collections."""
        return {
            'category_preferences': {},    # category -> {'count', 'confidence_sum'}
            'naming_patterns': {},         # reserved; not written by this class
            'file_type_mappings': {},      # extension -> {category: count}
            'classification_history': [],  # entries appended by learn_from_classification
            'user_corrections': [],        # entries appended by learn_from_correction
            'confidence_threshold': 0.7,
            'last_updated': datetime.now().isoformat()
        }

    def _load_preferences(self) -> Dict[str, Any]:
        """Load the preferences file, falling back to defaults.

        The stored data is merged over the default structure, so a file that
        lacks some keys (or holds the wrong container type for one) still
        yields every key the other methods index directly.

        Returns:
            The merged preferences, or pure defaults when the file is missing,
            unreadable or does not contain a JSON object.
        """
        defaults = self._default_preferences()
        try:
            if os.path.exists(self.preferences_file):
                with open(self.preferences_file, 'r', encoding='utf-8') as f:
                    loaded = json.load(f)
                if not isinstance(loaded, dict):
                    raise ValueError("preferences file must contain a JSON object")

                merged = {**defaults, **loaded}
                for key, default in defaults.items():
                    # A dict/list slot holding anything else would break the
                    # append/index operations below; restore the default.
                    if isinstance(default, (dict, list)) and not isinstance(merged[key], type(default)):
                        merged[key] = default
                return merged
        except Exception as e:
            logger.error(f"加载用户偏好失败: {str(e)}")

        return defaults

    def save_preferences(self):
        """Write the preferences to disk as UTF-8 JSON.

        Failures are logged, not raised.
        """
        try:
            self.preferences['last_updated'] = datetime.now().isoformat()

            # os.makedirs('') raises, so skip it when the path is a bare file
            # name without a directory part.
            parent_dir = os.path.dirname(self.preferences_file)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)

            with open(self.preferences_file, 'w', encoding='utf-8') as f:
                json.dump(self.preferences, f, ensure_ascii=False, indent=2)

            logger.info("用户偏好已保存")
        except Exception as e:
            logger.error(f"保存用户偏好失败: {str(e)}")

    def learn_from_classification(self, file_info: Dict[str, Any], classification_result: Dict[str, Any]):
        """Record a classification result and update the derived counters.

        Args:
            file_info: Dict from which ``filename`` and ``extension`` are read.
            classification_result: Dict from which ``category``,
                ``subcategory`` and ``confidence`` are read.

        The preferences are saved afterwards. Failures are logged, not raised.
        """
        try:
            category = classification_result.get('category', '')
            confidence = classification_result.get('confidence', 0.0)
            file_type = file_info.get('extension', '')

            history = self.preferences['classification_history']
            history.append({
                'timestamp': datetime.now().isoformat(),
                'filename': file_info.get('filename', ''),
                'file_type': file_type,
                'category': category,
                'subcategory': classification_result.get('subcategory', ''),
                'confidence': confidence
            })

            # Once the history exceeds 1000 entries keep only the newest 500.
            if len(history) > 1000:
                self.preferences['classification_history'] = history[-500:]

            if category:
                stats = self.preferences['category_preferences'].setdefault(
                    category, {'count': 0, 'confidence_sum': 0.0}
                )
                stats['count'] += 1
                stats['confidence_sum'] += confidence

            if file_type and category:
                per_type = self.preferences['file_type_mappings'].setdefault(file_type, {})
                per_type[category] = per_type.get(category, 0) + 1

            self.save_preferences()

        except Exception as e:
            logger.error(f"学习分类偏好失败: {str(e)}")

    def learn_from_correction(self, original_classification: Dict[str, Any], 
                            corrected_classification: Dict[str, Any], 
                            file_info: Dict[str, Any]):
        """Record a manual correction and boost the corrected category.

        Args:
            original_classification: The classification that was replaced.
            corrected_classification: The classification chosen by the user.
            file_info: Dict from which ``filename`` and ``extension`` are read.

        The preferences are saved afterwards. Failures are logged, not raised.
        """
        try:
            corrected_category = corrected_classification.get('category', '')

            corrections = self.preferences['user_corrections']
            corrections.append({
                'timestamp': datetime.now().isoformat(),
                'filename': file_info.get('filename', ''),
                'file_type': file_info.get('extension', ''),
                'original_category': original_classification.get('category', ''),
                'corrected_category': corrected_category,
                'original_subcategory': original_classification.get('subcategory', ''),
                'corrected_subcategory': corrected_classification.get('subcategory', ''),
                'correction_reason': 'user_manual_correction'
            })

            # Once more than 200 corrections are stored keep the newest 100.
            if len(corrections) > 200:
                self.preferences['user_corrections'] = corrections[-100:]

            if corrected_category:
                stats = self.preferences['category_preferences'].setdefault(
                    corrected_category, {'count': 0, 'confidence_sum': 0.0}
                )
                # A manual correction counts double and with full confidence.
                # NOTE(review): only category_preferences is updated here;
                # file_type_mappings, which predict_classification reads, is
                # not - confirm whether that is intended.
                stats['count'] += 2
                stats['confidence_sum'] += 1.0

            self.save_preferences()

        except Exception as e:
            logger.error(f"学习用户修正失败: {str(e)}")

    def predict_classification(self, file_info: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Predict a category from what has been learned so far.

        Args:
            file_info: Dict from which ``extension`` and ``filename`` are read.

        Returns:
            The more confident of the extension-based and the
            file-name-based prediction as a dict with ``category``,
            ``confidence`` and ``source``; ``None`` when neither is available
            or an error occurs.
        """
        try:
            file_type = file_info.get('extension', '')
            filename = file_info.get('filename', '')

            type_prediction = None
            type_mappings = self.preferences['file_type_mappings'].get(file_type)
            if type_mappings:
                # Most frequently used category for this extension; ten uses
                # map to confidence 1.0, capped at 0.9.
                best_category, use_count = max(type_mappings.items(), key=lambda item: item[1])
                type_prediction = {
                    'category': best_category,
                    'confidence': min(use_count / 10.0, 0.9),
                    'source': 'file_type_mapping'
                }

            name_prediction = self._predict_by_filename_pattern(filename)

            predictions = [p for p in (type_prediction, name_prediction) if p]
            if predictions:
                return max(predictions, key=lambda p: p['confidence'])

            return None

        except Exception as e:
            logger.error(f"预测分类失败: {str(e)}")
            return None

    def _predict_by_filename_pattern(self, filename: str) -> Optional[Dict[str, Any]]:
        """Predict a category from history entries with a similar file name.

        Returns:
            A prediction dict whose confidence is capped at 0.8, or ``None``
            when ``filename`` is empty or no usable history entry has a
            similarity above 0.6.
        """
        if not filename:
            return None

        candidates = []

        for history in self.preferences['classification_history']:
            hist_filename = history.get('filename', '')
            category = history.get('category', '')
            # Entries without a name or a category cannot yield a prediction.
            if not hist_filename or not category:
                continue

            similarity = self._calculate_filename_similarity(filename, hist_filename)
            if similarity > 0.6:
                candidates.append({
                    'category': category,
                    'confidence': (history.get('confidence') or 0.0) * similarity,
                    'similarity': similarity
                })

        if not candidates:
            return None

        # The ranking key multiplies by the similarity a second time (the
        # stored confidence already includes it); kept as in the original.
        best_match = max(candidates, key=lambda c: c['confidence'] * c['similarity'])
        return {
            'category': best_match['category'],
            'confidence': min(best_match['confidence'], 0.8),
            'source': 'filename_pattern'
        }

    def _calculate_filename_similarity(self, name1: str, name2: str) -> float:
        """Jaccard similarity of the lower-cased ``\\w+`` tokens of both
        names; 0.0 when either name has no token."""
        keywords1 = set(re.findall(r'\w+', name1.lower()))
        keywords2 = set(re.findall(r'\w+', name2.lower()))

        if not keywords1 or not keywords2:
            return 0.0

        return len(keywords1 & keywords2) / len(keywords1 | keywords2)

    def get_user_preferences_summary(self) -> Dict[str, Any]:
        """Summarize the learned preferences.

        Returns:
            A dict with ``total_classifications``, ``total_corrections``,
            ``favorite_categories`` (up to five, by descending count),
            ``file_type_preferences`` and ``confidence_threshold``; an empty
            dict when an error occurs.
        """
        try:
            summary = {
                'total_classifications': len(self.preferences['classification_history']),
                'total_corrections': len(self.preferences['user_corrections']),
                'favorite_categories': [],
                'file_type_preferences': {},
                'confidence_threshold': self.preferences.get('confidence_threshold', 0.7)
            }

            category_preferences = self.preferences['category_preferences']
            if category_preferences:
                ranked = sorted(
                    category_preferences.items(),
                    key=lambda item: item[1]['count'],
                    reverse=True
                )
                summary['favorite_categories'] = [cat for cat, _ in ranked[:5]]

            summary['file_type_preferences'] = self.preferences['file_type_mappings']

            return summary

        except Exception as e:
            logger.error(f"获取用户偏好摘要失败: {str(e)}")
            return {}


class IntelligentFileAnalyzer:
    """Facade that combines all analyzers of this module."""

    def __init__(self):
        self.metadata_extractor = FileMetadataExtractor()
        self.filename_analyzer = FilenameSemanticAnalyzer()
        self.content_extractor = ContentTopicExtractor()
        self.context_analyzer = ContextAnalyzer()
        self.preference_learner = UserPreferenceLearner()

    def comprehensive_analysis(self, file_path: str, content: str = "", 
                             context_files: Optional[List[str]] = None) -> Dict[str, Any]:
        """Run every analysis step on one file.

        Args:
            file_path: Path of the file to analyze.
            content: Text content of the file; topic and document-type
                detection are skipped when it is empty.
            context_files: Paths of surrounding files; context analysis is
                skipped when it is empty or ``None``.

        Returns:
            A dict with ``file_path``, ``metadata``, ``filename_analysis``,
            ``content_topics``, ``document_type``, ``context_analysis``,
            ``preference_prediction``, ``analysis_timestamp`` and
            ``confidence_score``. On failure a dict with ``file_path``,
            ``error`` and ``analysis_timestamp`` is returned instead.
        """
        try:
            logger.info(f"开始综合分析文件: {file_path}")

            filename = os.path.basename(file_path)

            # 1. Filesystem metadata
            metadata = self.metadata_extractor.extract_metadata(file_path)

            # 2. File-name semantics
            filename_analysis = self.filename_analyzer.analyze_filename(filename)

            # 3. Content topics
            content_topics = []
            document_type = None
            if content:
                content_topics = self.content_extractor.extract_topics(content)
                document_type = self.content_extractor.identify_document_type(content)

            # 4. Directory context
            context_analysis = {}
            if context_files:
                context_analysis = self.context_analyzer.analyze_directory_context(
                    context_files, file_path
                )

            # 5. Prediction from learned preferences
            file_info = {
                'filename': filename,
                'extension': metadata.get('extension', ''),
                'size': metadata.get('size', 0)
            }
            preference_prediction = self.preference_learner.predict_classification(file_info)

            # 6. Combined result
            analysis_result = {
                'file_path': file_path,
                'metadata': metadata,
                'filename_analysis': filename_analysis,
                'content_topics': content_topics,
                'document_type': document_type,
                'context_analysis': context_analysis,
                'preference_prediction': preference_prediction,
                'analysis_timestamp': datetime.now().isoformat(),
                'confidence_score': self._calculate_overall_confidence(
                    filename_analysis, content_topics, preference_prediction
                )
            }

            logger.info(f"文件分析完成: {file_path}, 置信度: {analysis_result['confidence_score']:.2f}")
            return analysis_result

        except Exception as e:
            logger.error(f"综合分析失败 {file_path}: {str(e)}")
            return {
                'file_path': file_path,
                'error': str(e),
                'analysis_timestamp': datetime.now().isoformat()
            }

    def _calculate_overall_confidence(self, filename_analysis: Dict[str, Any], 
                                    content_topics: List[Tuple[str, float]], 
                                    preference_prediction: Optional[Dict[str, Any]]) -> float:
        """Combine the three signals into one confidence in [0, 1].

        Weights: file-name score 0.3, best content-topic score 0.4,
        preference-prediction confidence 0.3. Missing signals contribute 0.
        """
        confidence = 0.0

        if filename_analysis:
            # The semantic score is an unbounded sum of per-pattern bonuses;
            # clamp it so this term cannot exceed its 0.3 weight.
            semantic_score = min(filename_analysis.get('semantic_score', 0.0), 1.0)
            confidence += semantic_score * 0.3

        if content_topics:
            confidence += max(score for _, score in content_topics) * 0.4

        if preference_prediction:
            confidence += preference_prediction.get('confidence', 0.0) * 0.3

        return min(confidence, 1.0)

    def generate_enhanced_classification_prompt(self, analysis_result: Dict[str, Any], 
                                              custom_prompt: str = "", 
                                              language: str = "Chinese") -> str:
        """Build a classification prompt enriched with the analysis result.

        Args:
            analysis_result: Result dict as produced by
                ``comprehensive_analysis``; missing sections are skipped.
            custom_prompt: Extra user requirements appended at the end.
            language: ``"Chinese"`` selects the Chinese wording; any other
                value selects the English wording.

        Returns:
            The prompt sections joined with newlines.
        """
        is_chinese = language == "Chinese"
        prompt_parts = []

        # Role definition
        if is_chinese:
            prompt_parts.append("""你是一个具有深度学习和多模态分析能力的智能文档分类专家。

核心能力：
1. **语义理解**: 深度理解文档的真实用途和语义含义
2. **上下文感知**: 分析文件与其环境的关系
3. **用户习惯学习**: 根据用户历史行为优化分类
4. **智能推理**: 基于多维度信息进行智能推理""")
        else:
            prompt_parts.append("""You are an intelligent document classification expert with deep learning and multi-modal analysis capabilities.

Core Capabilities:
1. **Semantic Understanding**: Deep understanding of document's true purpose and semantic meaning
2. **Context Awareness**: Analyze relationships between files and their environment
3. **User Habit Learning**: Optimize classification based on user historical behavior
4. **Intelligent Reasoning**: Make intelligent inferences based on multi-dimensional information""")

        metadata = analysis_result.get('metadata', {})
        filename_analysis = analysis_result.get('filename_analysis', {})
        content_topics = analysis_result.get('content_topics', [])
        context_analysis = analysis_result.get('context_analysis', {})
        preference_prediction = analysis_result.get('preference_prediction')

        # Basic file information
        if metadata:
            if is_chinese:
                prompt_parts.append(f"""
文件基本信息：
- 文件名: {metadata.get('basename', '')}
- 文件类型: {metadata.get('extension', '')}
- 文件大小: {metadata.get('size_mb', 0)}MB
- 创建时间: {metadata.get('created', '')}
- 修改时间: {metadata.get('modified', '')}""")
            else:
                prompt_parts.append(f"""
File Basic Information:
- Filename: {metadata.get('basename', '')}
- File Type: {metadata.get('extension', '')}
- File Size: {metadata.get('size_mb', 0)}MB
- Created: {metadata.get('created', '')}
- Modified: {metadata.get('modified', '')}""")

        # File-name semantics
        if filename_analysis:
            semantic_features = []
            if filename_analysis.get('has_date'):
                semantic_features.append("包含日期信息" if is_chinese else "Contains date information")
            if filename_analysis.get('has_version'):
                semantic_features.append("包含版本信息" if is_chinese else "Contains version information")
            if filename_analysis.get('has_project_indicator'):
                semantic_features.append("包含项目指示符" if is_chinese else "Contains project indicators")
            if filename_analysis.get('document_type'):
                semantic_features.append(f"文档类型: {filename_analysis['document_type']}" if is_chinese else f"Document type: {filename_analysis['document_type']}")

            if semantic_features:
                joined_features = '; '.join(semantic_features)
                if is_chinese:
                    prompt_parts.append(f"文件名语义特征: {joined_features}")
                else:
                    prompt_parts.append(f"Filename Semantic Features: {joined_features}")

        # Content topics
        if content_topics:
            joined_topics = ', '.join(f"{topic}({score:.2f})" for topic, score in content_topics)
            if is_chinese:
                prompt_parts.append(f"内容主要主题: {joined_topics}")
            else:
                prompt_parts.append(f"Content Main Topics: {joined_topics}")

        # Directory context
        if context_analysis:
            directory_purpose = context_analysis.get('directory_purpose')
            if directory_purpose:
                if is_chinese:
                    prompt_parts.append(f"目录用途: {directory_purpose}")
                else:
                    prompt_parts.append(f"Directory Purpose: {directory_purpose}")

            related_files = context_analysis.get('related_files', [])
            if related_files:
                # Only the first three related files are mentioned.
                joined_related = ', '.join([os.path.basename(f) for f in related_files[:3]])
                if is_chinese:
                    prompt_parts.append(f"相关文件: {joined_related}")
                else:
                    prompt_parts.append(f"Related Files: {joined_related}")

        # Preference hint, only when it is reasonably confident
        if preference_prediction:
            predicted_category = preference_prediction.get('category', '')
            prediction_confidence = preference_prediction.get('confidence', 0.0)
            if predicted_category and prediction_confidence > 0.5:
                if is_chinese:
                    prompt_parts.append(f"用户偏好预测: 倾向于分类为 '{predicted_category}' (置信度: {prediction_confidence:.2f})")
                else:
                    prompt_parts.append(f"User Preference Prediction: Tends to classify as '{predicted_category}' (confidence: {prediction_confidence:.2f})")

        # Classification strategy
        if is_chinese:
            prompt_parts.append("""
智能分类策略：
1. **项目导向**: 识别项目相关文件，按项目生命周期分类
2. **工作流感知**: 理解文档在工作流中的位置和作用
3. **时效性考虑**: 区分临时文件、工作文件、归档文件
4. **协作模式**: 识别团队协作和版本管理模式
5. **专业领域**: 准确识别专业领域和行业特征
6. **用户习惯**: 优先考虑用户的历史分类偏好

分类要求：
- 避免过度细分，创建实用的分类体系
- 考虑用户的工作习惯和偏好
- 提供高置信度的分类建议
- 分类名称简洁明了，便于管理""")
        else:
            prompt_parts.append("""
Intelligent Classification Strategy:
1. **Project-Oriented**: Identify project-related files, classify by project lifecycle
2. **Workflow-Aware**: Understand document's position and role in workflows
3. **Time-Sensitive**: Distinguish between temporary files, working files, archived files
4. **Collaboration-Aware**: Identify team collaboration and version management patterns
5. **Domain-Specific**: Accurately identify professional domains and industry characteristics
6. **User-Habit-Aware**: Prioritize user's historical classification preferences

Classification Requirements:
- Avoid over-segmentation, create practical classification system
- Consider user's work habits and preferences
- Provide high-confidence classification suggestions
- Category names should be concise, clear, and manageable""")

        # User-supplied requirements
        if custom_prompt:
            if is_chinese:
                prompt_parts.append(f"\n用户自定义要求: {custom_prompt}")
            else:
                prompt_parts.append(f"\nUser Custom Requirements: {custom_prompt}")

        return "\n".join(prompt_parts)

    def learn_from_result(self, analysis_result: Dict[str, Any], classification_result: Dict[str, Any]):
        """Feed a classification result into the preference learner.

        Args:
            analysis_result: Result dict as produced by
                ``comprehensive_analysis``.
            classification_result: The classification to learn from.

        Failures are logged, not raised.
        """
        try:
            metadata = analysis_result.get('metadata', {})
            extension = metadata.get('extension', '')

            # Record the same file name that comprehensive_analysis() uses for
            # prediction (basename including the extension). Storing only the
            # stem made an identical file score 0.5 similarity against its own
            # history entry, below the 0.6 match threshold.
            file_path = analysis_result.get('file_path', '')
            if file_path:
                filename = os.path.basename(file_path)
            else:
                filename = f"{metadata.get('basename', '')}{extension}"

            file_info = {
                'filename': filename,
                'extension': extension,
                'size': metadata.get('size', 0)
            }

            self.preference_learner.learn_from_classification(file_info, classification_result)

        except Exception as e:
            logger.error(f"学习分类结果失败: {str(e)}")

    def get_user_preferences_summary(self) -> Dict[str, Any]:
        """Return the preference learner's summary."""
        return self.preference_learner.get_user_preferences_summary()


# Public API: the classes exported by ``from <module> import *``.
__all__ = [
    'IntelligentFileAnalyzer',
    'FileMetadataExtractor',
    'FilenameSemanticAnalyzer',
    'ContentTopicExtractor',
    'ContextAnalyzer',
    'UserPreferenceLearner',
]
