"""
ml_engine.py
-----------------------
Core Machine Learning & AI engine for the TMA Assistant (Gemini-like Chatbot).

Features:
- Contextual response generation (uses retrieval + reasoning)
- Semantic document search
- Intent classification
- Feedback-based continual learning
- Automatic retraining from conversation logs
"""

import os
import json
import numpy as np
import datetime
from typing import List, Dict, Any
import spacy


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import joblib

from django.conf import settings
from .models import Document, ConversationLog, AssistantFeedback

# spaCy English pipeline, loaded once when this module is imported.
# NOTE(review): `nlp` is not referenced by any function visible in this file;
# confirm it is used by other modules before relying on or removing it.
nlp = spacy.load("en_core_web_sm")

# -----------------------------
# Global Paths & Variables
# -----------------------------
# Directory under the Django project's BASE_DIR where trained artefacts are
# stored; it is created at import time if it does not exist yet.
MODEL_DIR = os.path.join(settings.BASE_DIR, "ml_models")
os.makedirs(MODEL_DIR, exist_ok=True)

# Serialized scikit-learn Pipeline (TF-IDF vectorizer + Naive Bayes classifier).
INTENT_MODEL_PATH = os.path.join(MODEL_DIR, "intent_classifier.pkl")
# Despite the name, this file holds the fitted LabelEncoder that maps encoded
# class ids back to intent names (see train_intent_classifier /
# load_intent_classifier); the TF-IDF vectorizer itself lives inside the
# pipeline stored at INTENT_MODEL_PATH.
VECTORIZER_PATH = os.path.join(MODEL_DIR, "vectorizer.pkl")


# -----------------------------
# Utility: Basic Text Cleaner
# -----------------------------
def clean_text(text: str) -> str:
    """
    Normalises raw text before it is handed to the TF-IDF based models.

    The text is lower-cased, every character that is not an ASCII letter,
    a digit or whitespace is dropped (it is removed, not replaced by a
    space), and surrounding whitespace is trimmed.
    """
    import re

    lowered = text.lower()
    alnum_and_space_only = re.compile(r"[^a-z0-9\s]").sub("", lowered)
    return alnum_and_space_only.strip()


# -----------------------------
# 1. Semantic Document Retrieval
# -----------------------------
def retrieve_relevant_documents(query: str, top_k: int = 3) -> List[Dict[str, Any]]:
    """
    Retrieves the documents most similar to the given query from the
    Document model using TF-IDF cosine similarity.

    Args:
        query: Free-text search string; it is normalised with clean_text().
        top_k: Maximum number of documents to return.

    Returns:
        A list of at most ``top_k`` dicts with the keys "id", "title",
        "similarity" and "content", ordered from most to least similar.
        An empty list is returned when ``top_k`` is not positive, when no
        documents are stored, or when no TF-IDF vocabulary can be built
        from the documents and the query.
    """
    if top_k <= 0:
        return []

    # Materialise the queryset exactly once. A plain list keeps the row order
    # identical between vectorising and lookup, avoids the extra EXISTS query,
    # and sidesteps QuerySet.__getitem__, which rejects NumPy integer indices
    # (it only accepts built-in int / slice and raises TypeError otherwise).
    documents = list(Document.objects.all())
    if not documents:
        return []

    query = clean_text(query)
    texts = [clean_text(doc.content) for doc in documents]

    try:
        # The query is appended as the last row so it shares the vocabulary
        # and IDF weights with the documents.
        tfidf_matrix = TfidfVectorizer().fit_transform(texts + [query])
    except ValueError:
        # scikit-learn raises ValueError when the vocabulary is empty, e.g.
        # every document and the query contain no usable tokens.
        return []

    similarities = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1]).flatten()

    # Highest similarity first; tolist() yields plain Python ints.
    ranked_indices = similarities.argsort()[::-1][:top_k].tolist()

    results = []
    for idx in ranked_indices:
        doc = documents[idx]
        results.append({
            "id": doc.id,
            "title": doc.title,
            "similarity": float(similarities[idx]),
            "content": doc.content,
        })
    return results


# -----------------------------
# 2. Intent Classification
# -----------------------------
def train_intent_classifier():
    """
    Train or retrain a Naive Bayes intent classifier using stored conversation logs.
    This can be triggered automatically on feedback or on schedule.

    Every ConversationLog with a non-null intent is used: the cleaned user
    message is the feature and the stored intent is the label.

    Side effects:
        Writes the fitted pipeline to INTENT_MODEL_PATH and the fitted
        LabelEncoder to VECTORIZER_PATH (the constant's name is historical;
        the TF-IDF vectorizer is part of the pipeline).

    Returns:
        The fitted Pipeline, or None when there is no labelled data or the
        model cannot be fitted. On failure nothing is written to disk.
    """
    # Evaluate the queryset once instead of EXISTS + repeated iteration.
    logs = list(ConversationLog.objects.exclude(intent=None))
    if not logs:
        print("[ML] No data found for intent classification.")
        return None

    X = [clean_text(log.user_message) for log in logs]
    y = [log.intent for log in logs]

    le = LabelEncoder()
    y_encoded = le.fit_transform(y)

    pipeline = Pipeline([
        ("vectorizer", TfidfVectorizer()),
        ("classifier", MultinomialNB()),
    ])

    try:
        if len(X) < 2:
            # A single sample cannot be split 80/20 (train_test_split raises
            # because the train set would be empty), so fit on it directly
            # and skip the hold-out evaluation.
            pipeline.fit(X, y_encoded)
            print("[ML] Intent classifier retrained on a single sample; accuracy not evaluated.")
        else:
            X_train, X_test, y_train, y_test = train_test_split(
                X, y_encoded, test_size=0.2, random_state=42
            )
            # NOTE: the classifier only sees the training split, so its
            # classes_ may be a subset of the label encoder's classes.
            pipeline.fit(X_train, y_train)
            acc = pipeline.score(X_test, y_test)
            print(f"[ML] Intent classifier retrained with accuracy: {acc:.2f}")
    except ValueError as exc:
        # Raised by scikit-learn e.g. for an empty TF-IDF vocabulary (all
        # messages reduce to no usable tokens). Keep callers alive and leave
        # any previously saved model untouched.
        print(f"[ML] Intent classifier training failed: {exc}")
        return None

    joblib.dump(pipeline, INTENT_MODEL_PATH)
    joblib.dump(le, VECTORIZER_PATH)
    return pipeline


def load_intent_classifier():
    """
    Loads the saved intent classification model if available, training a
    new one first when nothing usable is stored on disk.

    Returns:
        A ``(pipeline, label_encoder)`` tuple. Either element is None when
        it could not be loaded or trained; ``(None, None)`` means no model
        is available at all.
    """
    if os.path.exists(INTENT_MODEL_PATH) and os.path.exists(VECTORIZER_PATH):
        try:
            pipeline = joblib.load(INTENT_MODEL_PATH)
            le = joblib.load(VECTORIZER_PATH)
            return pipeline, le
        except Exception as e:
            # Corrupt or incompatible artefacts: fall through and retrain.
            print(f"[ML] Error loading intent model: {e}")

    pipeline = train_intent_classifier()
    if pipeline is None:
        return None, None

    # train_intent_classifier() returns only the pipeline but persists the
    # fitted LabelEncoder to VECTORIZER_PATH. Read it back so a freshly
    # trained model is immediately usable instead of being paired with None.
    try:
        le = joblib.load(VECTORIZER_PATH)
    except Exception as e:
        print(f"[ML] Error loading label encoder: {e}")
        le = None
    return pipeline, le


def predict_intent(user_message: str):
    """
    Predicts intent of a message using trained classifier.

    Args:
        user_message: Raw user text; it is normalised with clean_text().

    Returns:
        An ``(intent, confidence)`` tuple. ``confidence`` is the predicted
        class probability. When no classifier or label encoder is available
        the fallback ``("general_query", 0.0)`` is returned.
    """
    model, label_encoder = load_intent_classifier()
    if model is None or label_encoder is None:
        return "general_query", 0.0

    cleaned = clean_text(user_message)
    proba = model.predict_proba([cleaned])[0]
    pred_idx = int(np.argmax(proba))
    confidence = float(proba[pred_idx])

    # predict_proba columns are ordered like model.classes_, i.e. the encoded
    # labels the classifier actually saw during fitting. That can be a subset
    # of the encoder's classes, so translate the column position into the
    # encoded label before decoding it.
    encoded_label = model.classes_[pred_idx]
    intent = label_encoder.inverse_transform([encoded_label])[0]
    return intent, confidence


# -----------------------------
# 3. AI Response Generation
# -----------------------------
def generate_response(user_message: str) -> Dict[str, Any]:
    """
    Core AI pipeline:
    - Classify intent
    - Retrieve relevant context documents
    - Generate response using hybrid retrieval + rule-based template
    - Persist the exchange as a ConversationLog row

    Args:
        user_message: Raw text sent by the user.

    Returns:
        A dict with the keys "response" (answer text), "intent",
        "confidence", "sources" (the retrieved document dicts) and "log_id"
        (primary key of the ConversationLog row that was created).
    """
    # Local import keeps this function self-contained. timezone.now() returns
    # an aware datetime when USE_TZ is enabled and a naive one otherwise,
    # which avoids Django's naive-datetime warning.
    from django.utils import timezone

    intent, confidence = predict_intent(user_message)
    context_docs = retrieve_relevant_documents(user_message, top_k=2)

    # Simple rule-based templates (can integrate GPT later)
    canned_answers = {
        "greeting": "Hello! How can I assist you today?",
        "training_inquiry": "Our training programs cover Full Stack, AI, and Cloud. Would you like details?",
        "event_info": "Upcoming TMA events include tech workshops and ambassador meetups.",
    }

    answer = canned_answers.get(intent)
    if answer is None:
        if context_docs:
            # Use context from retrieved docs, capped at 300 characters.
            joined_context = " ".join([doc["content"] for doc in context_docs])
            answer = f"I found some information related to your query: {joined_context[:300]}..."
        else:
            answer = "I’ll check my sources and get back with accurate details."

    # Save to conversation logs
    log = ConversationLog.objects.create(
        user_message=user_message,
        ai_response=answer,
        intent=intent,
        confidence=confidence,
        referenced_sources=[doc["id"] for doc in context_docs],
        timestamp=timezone.now(),
    )
    return {
        "response": answer,
        "intent": intent,
        "confidence": confidence,
        "sources": context_docs,
        "log_id": log.id,
    }


# -----------------------------
# 4. Feedback Learning
# -----------------------------
def process_feedback(log_id: int, feedback: str):
    """
    Processes user feedback ('good'/'bad') and triggers retraining if needed.

    The feedback is stored as an AssistantFeedback row linked to the
    conversation log. For "bad" feedback (compared case-insensitively) the
    log's intent is overwritten with "uncertain" and retrain_intent_model()
    is called, which retrains once enough bad feedback has accumulated.

    Args:
        log_id: Primary key of the ConversationLog the feedback refers to.
        feedback: Feedback label supplied by the user.

    Returns:
        A dict with "status" ("success" or "error") and a "message".
    """
    # Local import keeps this function self-contained. timezone.now() returns
    # an aware datetime when USE_TZ is enabled and a naive one otherwise,
    # which avoids Django's naive-datetime warning.
    from django.utils import timezone

    log = ConversationLog.objects.filter(id=log_id).first()
    if not log:
        return {"status": "error", "message": "Conversation log not found."}

    AssistantFeedback.objects.create(
        log=log,
        feedback=feedback,
        timestamp=timezone.now(),
    )

    if feedback.lower() == "bad":
        # Relabel the log so it enters the training set as "uncertain".
        log.intent = "uncertain"
        log.save()
        retrain_intent_model()
    return {"status": "success", "message": "Feedback recorded."}


# -----------------------------
# 5. Automatic Retraining
# -----------------------------
def retrain_intent_model(threshold: int = 5):
    """
    Retrains the intent classifier once the number of stored "bad" feedback
    items (matched case-insensitively) reaches ``threshold``.

    After retraining, every AssistantFeedback row is deleted so counting
    starts from zero again. Does nothing while the count is below the
    threshold.
    """
    negative_total = AssistantFeedback.objects.filter(feedback__iexact="bad").count()
    if negative_total < threshold:
        return

    print("[ML] Threshold reached. Retraining intent classifier...")
    train_intent_classifier()
    # Reset feedback after training (removes all feedback rows, not only "bad").
    AssistantFeedback.objects.all().delete()


# -----------------------------
# 6. On-demand Search / QA
# -----------------------------
def semantic_search(query: str) -> List[Dict[str, Any]]:
    """
    Semantic search entry point (used in /api/ml-search/).

    Delegates to retrieve_relevant_documents() and returns its result for
    the five best-matching documents.
    """
    top_matches = retrieve_relevant_documents(query, top_k=5)
    return top_matches
