import numpy as np
import csv
import os
import re
from sklearn.metrics.pairwise import cosine_similarity
from app.ml.model_loader import get_model_components

# Cache of tortured phrase (lower-cased) -> expected text.
# Filled lazily, and in place, by load_tortured_phrases().
TORTURED_PHRASES_MAP = {}


def load_tortured_phrases(csv_path=None):
    """
    Load (once) and return the tortured-phrase -> expected-text mapping.

    Each CSV row provides a "Fingerprint - Tortured Phrase" column holding one
    or more double-quoted phrases (e.g. '"surface region" AND "surface area"')
    and an "Expected Text" column. Every quoted phrase is lower-cased, stripped
    and mapped to the row's expected text; phrases of 2 characters or fewer
    are ignored. If a phrase occurs in several rows, the last row wins.

    Args:
        csv_path: Optional path of the CSV file to read. Defaults to
            ``../../data/fingerprints.csv`` relative to this module. Only
            consulted while the cache is still empty; once phrases have been
            loaded the cached mapping is returned regardless of this argument.

    Returns:
        The module-level ``TORTURED_PHRASES_MAP`` dict.

    Raises:
        OSError: If the CSV file cannot be opened.
        KeyError: If a required column is missing from the CSV header.
    """
    if TORTURED_PHRASES_MAP:
        return TORTURED_PHRASES_MAP

    if csv_path is None:
        csv_path = os.path.join(os.path.dirname(__file__), "../../data/fingerprints.csv")

    # Build into a local dict first so that a failure half-way through the
    # file never leaves a partially filled cache behind.
    loaded = {}

    # "utf-8-sig" transparently drops a leading BOM (otherwise the first header
    # name would not match); newline="" is what the csv module expects so that
    # it can handle line endings inside quoted fields itself.
    with open(csv_path, "r", encoding="utf-8-sig", newline="") as f:
        for row in csv.DictReader(f):
            fingerprint = row["Fingerprint - Tortured Phrase"]
            expected = row["Expected Text"]

            # DictReader fills the missing cells of a short row with None;
            # such a row cannot provide a usable phrase/expected pair.
            if fingerprint is None or expected is None:
                continue

            # Extract individual phrases from fingerprint (remove AND, quotes)
            # e.g., '"surface region" AND "surface area"' -> ['surface region', 'surface area']
            for phrase in re.findall(r'"([^"]+)"', fingerprint):
                phrase_lower = phrase.lower().strip()
                if len(phrase_lower) > 2:
                    loaded[phrase_lower] = expected

    # Mutate the existing dict instead of rebinding the name, so modules that
    # imported TORTURED_PHRASES_MAP directly observe the loaded data as well.
    TORTURED_PHRASES_MAP.update(loaded)
    return TORTURED_PHRASES_MAP

def _spans_overlap(first, second):
    """Return True when two detections share text; spans that merely touch do not overlap."""
    return first["start"] < second["end"] and second["start"] < first["end"]


def _yields_to(det, other):
    """Return True when `det` should be suppressed in favour of `other`."""
    # Rule 1: det lies inside other's span and is the shorter text.
    if (det["start"] >= other["start"] and det["end"] <= other["end"] and
            len(det["phrase"]) < len(other["phrase"])):
        return True
    # Rule 2: det's expected text is a proper substring of other's expected
    # text and the two spans overlap (partial match of a longer phrase).
    return (det["matched_with"] != other["matched_with"] and
            det["matched_with"] in other["matched_with"] and
            _spans_overlap(det, other))


def detect_tortured_phrases(candidate_phrases, *, tortured_map=None):
    """
    Detect tortured phrases using EXACT string matching only.

    Each candidate's text is lower-cased and stripped, then looked up in the
    tortured-phrase mapping. Overlapping hits are reduced to a set of
    non-overlapping detections, preferring longer / more complete matches.

    Args:
        candidate_phrases: Iterable of dicts with "text", "start" and "end".
        tortured_map: Optional mapping of lower-cased tortured phrase to
            expected text. When omitted, load_tortured_phrases() supplies it.

    Returns:
        List of dicts with keys "phrase" (the candidate's original text),
        "start", "end", "matched_with" (expected text) and "confidence"
        (always 1.0), sorted by "start". No two returned spans overlap.
    """
    if tortured_map is None:
        tortured_map = load_tortured_phrases()

    detections = []
    for phrase_data in candidate_phrases:
        phrase_text = phrase_data["text"].lower().strip()

        # Exact match only
        if phrase_text in tortured_map:
            detections.append({
                "phrase": phrase_data["text"],
                "start": phrase_data["start"],
                "end": phrase_data["end"],
                "matched_with": tortured_map[phrase_text],
                "confidence": 1.0  # 100% - exact match
            })

    # Pass 1: set aside detections that yield to some other detection.
    survivors = []
    suppressed = []
    for det in detections:
        if any(other is not det and _yields_to(det, other) for other in detections):
            suppressed.append(det)
        else:
            survivors.append(det)

    # Pass 2: remove remaining overlaps, keeping higher confidence matches
    # (and, for equal confidence, the one that starts first).
    final = []
    for det in sorted(survivors, key=lambda x: (-x["confidence"], x["start"])):
        if not any(_spans_overlap(det, kept) for kept in final):
            final.append(det)

    # Pass 3: a detection may only be suppressed in favour of something that is
    # actually reported. Pass 1 compares against *all* detections, so two
    # overlapping matches can eliminate each other (one by containment, the
    # other by the expected-text rule), or a sub-phrase can be dropped for a
    # longer match that itself lost in pass 2. Re-admit such orphans, longest
    # span first, as long as they overlap nothing already reported.
    for det in sorted(suppressed, key=lambda x: (x["start"] - x["end"], x["start"])):
        if not any(_spans_overlap(det, kept) for kept in final):
            final.append(det)

    return sorted(final, key=lambda x: x["start"])
