"""
EA-RAG 向量数据库 - ChromaDB 实现

每个专家对应两个 Collection:
- {expert_id}_code: Code Layer
- {expert_id}_experience: Exp. Layer
"""

import chromadb
from chromadb.config import Settings
from typing import List, Dict, Optional, Tuple
from pathlib import Path

from .config import KNOWLEDGE_BASE_DIR, TOP_K_PER_EXPERT
from .models import Chunk


class VectorStore:
    """
    ChromaDB-backed vector store.

    Directory layout:
    knowledge_base/
    └── chroma_db/     <- ChromaDB persistence directory
    """

    def __init__(self, persist_dir: Optional[Path] = None):
        """
        Open (creating it if necessary) a persistent ChromaDB client.

        Args:
            persist_dir: Persistence directory. Defaults to
                ``KNOWLEDGE_BASE_DIR / "chroma_db"`` when omitted.
        """
        # Coerce to Path so a plain string argument also works with .mkdir().
        self.persist_dir = (
            Path(persist_dir) if persist_dir else (KNOWLEDGE_BASE_DIR / "chroma_db")
        )
        self.persist_dir.mkdir(parents=True, exist_ok=True)

        # Persistent (on-disk) client with anonymized telemetry turned off.
        self.client = chromadb.PersistentClient(
            path=str(self.persist_dir),
            settings=Settings(
                anonymized_telemetry=False
            )
        )

    @staticmethod
    def _entry_name(entry) -> str:
        """Return the collection name for one ``list_collections()`` entry.

        NOTE(review): depending on the installed ChromaDB release the entries
        appear to be either Collection objects or plain names; both shapes are
        accepted here -- confirm against the pinned version.
        """
        return entry if isinstance(entry, str) else entry.name

    def _find_collection(self, name: str) -> Optional[chromadb.Collection]:
        """Return the named collection, or None when it cannot be fetched.

        The exception raised for a missing collection is not the same across
        ChromaDB releases (``ValueError`` vs. ``chromadb.errors.NotFoundError``),
        so any failure to fetch is treated as "not available".
        """
        try:
            return self.client.get_collection(name)
        except Exception:
            return None

    def get_or_create_collection(self, name: str) -> chromadb.Collection:
        """Return the collection called ``name``, creating it if needed.

        The collection is requested with the cosine HNSW space.
        """
        return self.client.get_or_create_collection(
            name=name,
            metadata={"hnsw:space": "cosine"}  # cosine distance
        )

    def delete_collection(self, name: str):
        """Delete the collection called ``name`` on a best-effort basis.

        Any error (for example the collection not existing) is deliberately
        ignored so callers can use this as "make sure it is gone".
        """
        try:
            self.client.delete_collection(name)
        except Exception:
            # Best-effort: nothing to do if the collection cannot be deleted.
            pass

    def add_chunks(
        self,
        collection_name: str,
        chunks: List[Chunk],
        embeddings: List[List[float]]
    ):
        """
        Add chunks and their embeddings to a collection.

        Args:
            collection_name: Name of the target collection (created if absent).
            chunks: Chunks to store; an empty list is a no-op.
            embeddings: One embedding vector per chunk, in the same order.

        Raises:
            ValueError: If ``chunks`` and ``embeddings`` differ in length.
        """
        if not chunks:
            return

        if len(chunks) != len(embeddings):
            raise ValueError(
                f"chunks ({len(chunks)}) and embeddings ({len(embeddings)}) "
                "must have the same length"
            )

        collection = self.get_or_create_collection(collection_name)

        # Continue numbering after the records already stored; restarting at 0
        # on every call would reuse the IDs of an earlier batch.
        offset = collection.count()
        ids = [f"{collection_name}_{offset + i}" for i in range(len(chunks))]
        documents = [chunk.content for chunk in chunks]
        metadatas = [
            {
                "source": chunk.metadata.get("source", ""),
                "path": chunk.metadata.get("path", ""),
                "clause": chunk.metadata.get("clause", ""),
                "doc_type": chunk.doc_type,
                "expert_id": chunk.expert_id
            }
            for chunk in chunks
        ]

        collection.add(
            ids=ids,
            embeddings=embeddings,
            documents=documents,
            metadatas=metadatas
        )

    def search(
        self,
        collection_name: str,
        query_embedding: List[float],
        top_k: int = TOP_K_PER_EXPERT
    ) -> List[Tuple[Chunk, float]]:
        """
        Search a collection for the chunks most similar to a query vector.

        Args:
            collection_name: Name of the collection to query.
            query_embedding: Query vector.
            top_k: Maximum number of results to return.

        Returns:
            ``[(Chunk, score), ...]`` in the order ChromaDB returns them, where
            ``score = 1 - distance``. An empty list is returned when the
            collection is missing or empty, or when ``top_k`` is not positive.
        """
        if top_k <= 0:
            return []

        collection = self._find_collection(collection_name)
        if collection is None:
            return []

        total = collection.count()
        if total == 0:
            return []

        results = collection.query(
            query_embeddings=[query_embedding],
            # Never request more results than there are stored records.
            n_results=min(top_k, total),
            include=["documents", "metadatas", "distances"]
        )

        documents = results["documents"]
        if not documents or not documents[0]:
            return []

        # Only one query vector was sent, so only the first result row matters.
        metadatas = results["metadatas"][0] if results["metadatas"] else None
        distances = results["distances"][0] if results["distances"] else None

        chunks_with_scores = []
        for i, doc in enumerate(documents[0]):
            # A record may carry no metadata; fall back to an empty dict.
            metadata = (metadatas[i] if metadatas is not None else None) or {}
            distance = distances[i] if distances is not None else 0

            # ChromaDB reports a distance; convert cosine distance to similarity.
            similarity = 1 - distance

            chunk = Chunk(
                content=doc,
                metadata={
                    "source": metadata.get("source", ""),
                    "path": metadata.get("path", ""),
                    "clause": metadata.get("clause", "")
                },
                doc_type=metadata.get("doc_type", ""),
                expert_id=metadata.get("expert_id", ""),
                embedding=None  # the vector is not carried on the returned Chunk
            )
            chunks_with_scores.append((chunk, similarity))

        return chunks_with_scores

    def get_collection_count(self, collection_name: str) -> int:
        """Return the number of records in a collection.

        Returns 0 when the collection cannot be fetched or counted.
        """
        try:
            collection = self.client.get_collection(collection_name)
            return collection.count()
        except Exception:  # the "not found" exception type varies by release
            return 0

    def collection_exists(self, collection_name: str) -> bool:
        """Return True when the collection exists and holds at least one record."""
        return self.get_collection_count(collection_name) > 0

    def list_collections(self) -> List[str]:
        """Return the names of all collections."""
        return [self._entry_name(c) for c in self.client.list_collections()]

    def clear_all(self):
        """Delete every collection in the store."""
        for entry in self.client.list_collections():
            self.client.delete_collection(self._entry_name(entry))
