# website/management/commands/index_docs.py
import os
import json
from django.core.management.base import BaseCommand
from django.conf import settings
from website.models import Post, Event, Document, FileUpload  # adapt to your models
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from django.db import transaction
from pathlib import Path

# Default maximum number of characters per chunk produced by chunk_text().
CHUNK_SIZE = 800  # characters per chunk
# Name of the SentenceTransformer model used to embed the chunks.
EMBED_MODEL_NAME = "all-MiniLM-L6-v2"
# Destination of the serialized FAISS index. Taken from the FAISS_INDEX_PATH
# setting; a missing or falsy setting falls back to a relative file name.
INDEX_PATH = getattr(settings, "FAISS_INDEX_PATH", None) or "faiss_index.bin"
# Destination of the chunk metadata JSON. Taken from the DOCS_JSON_PATH
# setting; a missing or falsy setting falls back to a relative file name.
DOCS_JSON = getattr(settings, "DOCS_JSON_PATH", None) or "doc_texts.json"

def chunk_text(text, size=CHUNK_SIZE):
    """Split *text* into consecutive, non-overlapping chunks.

    Leading and trailing whitespace is stripped before splitting. Every chunk
    holds exactly *size* characters except possibly the last, which holds the
    remainder.

    Args:
        text: The string to split. ``None`` is treated like an empty string.
        size: Maximum number of characters per chunk; must be positive.

    Returns:
        The chunks in order; an empty list when the stripped text is empty.

    Raises:
        ValueError: If *size* is zero or negative, because such a step can
            never advance through the text.
    """
    if size <= 0:
        raise ValueError(f"size must be a positive integer, got {size!r}")
    if text is None:
        return []
    text = text.strip()
    return [text[start:start + size] for start in range(0, len(text), size)]

class Command(BaseCommand):
    """Rebuild the FAISS search index from Post, Event and FileUpload rows.

    Text from each row is chunked with chunk_text(), embedded with the
    SentenceTransformer model named by EMBED_MODEL_NAME and added to a FAISS
    inner-product index over L2-normalized vectors. The chunk metadata is
    written to DOCS_JSON and mirrored into Document rows.
    """

    help = "Index Posts, Events, Files into Document model and build FAISS embeddings."

    def handle(self, *args, **options):
        """Run the full indexing pipeline.

        Args:
            *args: Positional arguments from the management framework (unused).
            **options: Parsed command-line options (unused).
        """
        docs = self._collect_docs()
        if not docs:
            self.stdout.write(self.style.WARNING("No docs found to index."))
            return

        # Load the embedding model only once there is something to encode.
        model = SentenceTransformer(EMBED_MODEL_NAME)
        self._write_index(model, docs)
        self._write_metadata(docs)
        self._store_documents(docs)

    @staticmethod
    def _chunk_records(source, source_id, title, text):
        """Return one metadata dict per chunk of *text* for a single source row."""
        return [
            {
                "source": source,
                "source_id": str(source_id),
                "title": title,
                "text": chunk,
                "chunk_index": idx,
            }
            for idx, chunk in enumerate(chunk_text(text))
        ]

    def _collect_docs(self):
        """Gather chunk records from every Post, Event and readable FileUpload."""
        docs = []

        for post in Post.objects.all():
            docs.extend(
                self._chunk_records("post", post.id, post.title, f"{post.title}\n\n{post.content}")
            )

        for event in Event.objects.all():
            docs.extend(
                self._chunk_records("event", event.id, event.title, f"{event.title}\n\n{event.description}")
            )

        # Naive: files are read as UTF-8 text. PDFs and other binary formats
        # need an extraction library (pdfminer/textract) and are skipped here.
        for upload in FileUpload.objects.all():
            # Keep the try body to the read itself so that only I/O and
            # decoding problems are reported as an unreadable file.
            try:
                with open(upload.file.path, "r", encoding="utf-8") as fh:
                    raw = fh.read()
            except (OSError, ValueError, NotImplementedError) as exc:
                # ValueError also covers UnicodeDecodeError raised for binary files.
                self.stdout.write(
                    self.style.WARNING(
                        f"Could not read file {upload} ({exc}) — skip or add PDF parser."
                    )
                )
                continue
            docs.extend(self._chunk_records("file", upload.id, upload.filename or "", raw))

        return docs

    def _write_index(self, model, docs):
        """Embed every chunk and write the resulting FAISS index to INDEX_PATH."""
        texts = [d["text"] for d in docs]
        self.stdout.write(f"Encoding {len(texts)} doc chunks with {EMBED_MODEL_NAME} ...")
        embeddings = model.encode(texts, show_progress_bar=True, convert_to_numpy=True)
        # FAISS operates on C-contiguous float32 matrices; enforce that
        # regardless of what the encoder returned.
        embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
        # Normalize so that inner product equals cosine similarity.
        faiss.normalize_L2(embeddings)
        index = faiss.IndexFlatIP(embeddings.shape[1])
        index.add(embeddings)

        # Settings may supply a pathlib.Path; faiss.write_index needs a str,
        # and the target directory may not exist yet.
        index_path = Path(INDEX_PATH)
        index_path.parent.mkdir(parents=True, exist_ok=True)
        faiss.write_index(index, str(index_path))
        self.stdout.write(self.style.SUCCESS(f"FAISS index written to {INDEX_PATH}"))

    def _write_metadata(self, docs):
        """Dump the chunk records to DOCS_JSON in the same order as the index rows."""
        docs_path = Path(DOCS_JSON)
        docs_path.parent.mkdir(parents=True, exist_ok=True)
        with open(docs_path, "w", encoding="utf-8") as out:
            json.dump(docs, out, ensure_ascii=False, indent=2)
        self.stdout.write(self.style.SUCCESS(f"Document metadata written to {DOCS_JSON}"))

    def _store_documents(self, docs):
        """Replace all Document rows with the freshly indexed chunks."""
        # One transaction, so a failure part-way keeps the previous rows.
        with transaction.atomic():
            Document.objects.all().delete()  # clear and re-add
            for d in docs:
                Document.objects.create(
                    source=d["source"],
                    source_id=d["source_id"],
                    # A source row may carry a None title; store "" instead of
                    # failing on the slice.
                    title=(d.get("title") or "")[:400],
                    text=d["text"],
                    chunk_index=d["chunk_index"],
                )
        self.stdout.write(self.style.SUCCESS("Document objects stored in DB."))
