"""Build ChromaDB (semantic) and BM25 (keyword) indices from .md corpus.

Uses sentence-transformers for embeddings (CPU, no Ollama).
Sanitizes chunks at ingestion time via prompt filter.
"""

import json
import os
import pickle
from pathlib import Path
from typing import List, Tuple

import chromadb
from chromadb.config import Settings
from rank_bm25 import BM25Okapi
import yaml

from .stemmer import tokenize_and_stem
from .embeddings import get_embeddings_batch
from utils.errors import CorpusEmptyError
from utils.sanitizer import sanitize_chunk


def load_config(config_path: str = "config.yaml") -> dict:
    """Load the YAML configuration file.

    Args:
        config_path: Path to the YAML config file.

    Returns:
        The parsed configuration; an empty dict when the file contains no
        YAML document.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the content is not valid YAML.
    """
    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # default locale encoding.
    with open(config_path, encoding="utf-8") as f:
        cfg = yaml.safe_load(f)

    # safe_load() yields None for an empty document; honour the declared
    # return type so callers get a KeyError naming the missing section
    # instead of a TypeError on None.
    return {} if cfg is None else cfg


def load_corpus(corpus_path: str, extensions: List[str]) -> List[Tuple[str, str]]:
    """Load every file under *corpus_path* whose name ends with an extension.

    The directory is searched recursively. Extensions are processed in the
    order given and the matches for each one are visited in sorted path
    order, so repeated runs over the same corpus yield the same document
    order. A file matched by more than one extension is loaded only once,
    and directories whose names happen to match are ignored.

    Args:
        corpus_path: Root directory of the corpus.
        extensions: File-name suffixes to match (e.g. ``[".md"]``).

    Returns:
        A list of ``(path, content)`` tuples, one per file read successfully.

    Raises:
        CorpusEmptyError: If the directory does not exist or no document
            could be loaded.
    """
    corpus_dir = Path(corpus_path)
    if not corpus_dir.exists():
        raise CorpusEmptyError(f"Corpus directory does not exist: {corpus_path}")

    docs: List[Tuple[str, str]] = []
    seen = set()

    for ext in extensions:
        # sorted() removes the dependence on filesystem enumeration order.
        for file_path in sorted(corpus_dir.rglob(f"*{ext}")):
            if file_path in seen:
                continue
            seen.add(file_path)
            try:
                if not file_path.is_file():
                    continue
                content = file_path.read_text(encoding="utf-8")
            except (OSError, UnicodeError) as e:
                # Best effort: report unreadable or non-UTF-8 files and go on.
                print(f"  ! {file_path}: {e}")
                continue
            docs.append((str(file_path), content))
            print(f"  + {file_path.name} ({len(content):,} chars)")

    if not docs:
        raise CorpusEmptyError(
            f"No documents with extensions {extensions} found in {corpus_path}"
        )

    return docs


def chunk_document(
    content: str,
    chunk_size: int,
    overlap: int,
    *,
    min_length: int = 50,
) -> List[str]:
    """Split *content* into overlapping chunks, snapping to sentence ends.

    A window of ``chunk_size`` characters slides over the text. When the
    window does not reach the end of the text and contains a ``". "`` past
    its midpoint, the chunk is cut just after that sentence end. Each
    following window starts ``overlap`` characters before the previous
    chunk's end. Chunking stops as soon as a chunk reaches the end of the
    text, so no trailing chunk merely repeats a suffix of the previous one.

    Args:
        content: Text to split.
        chunk_size: Maximum chunk length in characters; must be positive.
        overlap: Characters shared between consecutive chunks. Values
            outside ``[0, chunk_size - 1]`` are clamped into that range.
        min_length: Chunks whose stripped length is not greater than this
            are discarded.

    Returns:
        The stripped chunks, in document order.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    # An overlap >= chunk_size would never advance; a negative one would
    # silently skip text between chunks.
    overlap = min(max(overlap, 0), chunk_size - 1)

    total = len(content)
    chunks = []
    start = 0

    while start < total:
        end = start + chunk_size

        if end < total:
            last_period = content[start:end].rfind(". ")
            snapped_len = last_period + 2
            # Snap only when the shortened chunk is still longer than the
            # overlap; otherwise the next window would start at or before
            # the current one and the loop would never terminate.
            if last_period > chunk_size // 2 and snapped_len > overlap:
                end = start + snapped_len

        chunks.append(content[start:end].strip())

        if end >= total:
            # The text is fully covered; further windows would only repeat
            # suffixes of this chunk.
            break
        start = end - overlap

    return [c for c in chunks if len(c) > min_length]


def build_indices(config_path: str = "config.yaml"):
    """Build the ChromaDB and BM25 indices described by the config file.

    Steps:
      1. Load the corpus files.
      2. Chunk every document and pass each chunk through ``sanitize_chunk``.
      3. Recreate the ChromaDB collection and insert the chunks with their
         embeddings, ``batch_size`` chunks at a time.
      4. Build a BM25 index over the stemmed chunks and pickle it together
         with the chunk texts and metadata.

    Any existing collection of the same name and any existing BM25 file are
    replaced.

    Args:
        config_path: Path to the YAML config; also forwarded to
            ``sanitize_chunk`` and ``get_embeddings_batch``.

    Raises:
        CorpusEmptyError: If no documents are found or no chunk survives
            chunking.
        ValueError: If the configured ``batch_size`` is not positive.
        KeyError: If a required config key is missing.
    """
    cfg = load_config(config_path)

    corpus_path = cfg["corpus"]["path"]
    extensions = cfg["corpus"]["extensions"]
    chroma_path = cfg["indexing"]["chroma_path"]
    bm25_path = cfg["indexing"]["bm25_path"]
    collection_name = cfg["indexing"]["collection_name"]
    chunk_size = cfg["indexing"]["chunk_size"]
    chunk_overlap = cfg["indexing"]["chunk_overlap"]
    batch_size = cfg["indexing"].get("batch_size", 50)

    # A negative step would make the batching range empty and silently index
    # nothing; fail before any existing index is touched.
    if batch_size <= 0:
        raise ValueError(f"indexing.batch_size must be positive, got {batch_size}")

    # 1. Load corpus
    print(f"\n[1/4] Loading corpus from {corpus_path}...")
    docs = load_corpus(corpus_path, extensions)
    print(f"  = {len(docs)} documents loaded\n")

    # 2. Chunk and sanitize
    print("[2/4] Chunking and sanitizing documents...")
    all_chunks: List[str] = []
    chunk_metadata: List[dict] = []

    for file_path, content in docs:
        chunks = chunk_document(content, chunk_size, chunk_overlap)
        for i, chunk in enumerate(chunks):
            sanitized = sanitize_chunk(chunk, config_path)
            all_chunks.append(sanitized)
            chunk_metadata.append({
                "source": file_path,
                "chunk_id": i,
                # Stored as a JSON string: first ten stemmed tokens.
                "stem_tags": json.dumps(tokenize_and_stem(sanitized)[:10])
            })

    print(f"  = {len(all_chunks)} chunks created\n")

    # BM25Okapi divides by the corpus size, so an empty chunk list would crash
    # with ZeroDivisionError *after* the old collection had been deleted.
    if not all_chunks:
        raise CorpusEmptyError(
            f"No chunks produced from {len(docs)} document(s) in {corpus_path}"
        )

    # 3. Build ChromaDB with batch sentence-transformers embeddings
    print("[3/4] Building ChromaDB (sentence-transformers, CPU)...")
    os.makedirs(chroma_path, exist_ok=True)

    client = chromadb.PersistentClient(
        path=chroma_path,
        settings=Settings(anonymized_telemetry=False)
    )

    # Deliberate best effort: the collection may not exist yet, and the
    # exception raised for that differs between chromadb releases. A real
    # failure surfaces in create_collection() just below.
    try:
        client.delete_collection(collection_name)
    except Exception:
        pass

    collection = client.create_collection(
        name=collection_name,
        metadata={"hnsw:space": "cosine"}
    )

    # Batch embed and insert; ids are global chunk positions.
    for i in range(0, len(all_chunks), batch_size):
        batch_end = min(i + batch_size, len(all_chunks))
        batch_texts = all_chunks[i:batch_end]
        batch_meta = chunk_metadata[i:batch_end]
        batch_ids = [f"chunk_{j}" for j in range(i, batch_end)]

        embeddings = get_embeddings_batch(batch_texts, config_path)

        collection.add(
            ids=batch_ids,
            embeddings=embeddings,
            documents=batch_texts,
            metadatas=batch_meta
        )
        print(f"  Indexed {batch_end}/{len(all_chunks)} chunks", end="\r")

    print(f"\n  = ChromaDB index at {chroma_path}\n")

    # 4. Build BM25
    print("[4/4] Building BM25 keyword index...")
    tokenized_corpus = [tokenize_and_stem(chunk) for chunk in all_chunks]
    bm25 = BM25Okapi(tokenized_corpus)

    # Path.parent is "." for a bare file name, whereas os.path.dirname()
    # returns "" and os.makedirs("") raises FileNotFoundError.
    Path(bm25_path).parent.mkdir(parents=True, exist_ok=True)
    with open(bm25_path, "wb") as f:
        pickle.dump({
            "bm25": bm25,
            "chunks": all_chunks,
            "metadata": chunk_metadata
        }, f)

    print(f"  = BM25 index at {bm25_path}\n")
    print("=" * 60)
    print("INDEX BUILD COMPLETE")
    print(f"  Documents:  {len(docs)}")
    print(f"  Chunks:     {len(all_chunks)}")
    print(f"  ChromaDB:   {chroma_path}")
    print(f"  BM25:       {bm25_path}")
    print("=" * 60)


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="SafeClaw corpus indexer")
    parser.add_argument("--config", default="config.yaml", help="Config file path")
    # build_indices() always drops and recreates both indices, so this flag
    # has no effect. It is still accepted so existing command lines that pass
    # --rebuild keep working.
    parser.add_argument(
        "--rebuild",
        action="store_true",
        help="Accepted for compatibility; indices are always rebuilt",
    )
    args = parser.parse_args()
    build_indices(args.config)
