"""
Simplified ML and NLP implementations — numpy only.

Interfaces mirror scikit-learn:
  Vectorisers : .fit(corpus)  .transform(corpus)  .fit_transform(corpus)
  Classifiers : .fit(X, y)   .predict(X)
  KMeans      : .fit(X)      .predict(X)          .labels_
"""

import math
import random
from collections import Counter

import numpy as np


# ─── Helpers ─────────────────────────────────────────────────────────────────

def tokenize(text: str) -> list:
    return text.lower().split()


def train_test_split(X, y, test_size=0.2, seed=42):
    idx = list(range(len(y)))
    random.Random(seed).shuffle(idx)
    cut = int(len(idx) * (1 - test_size))
    tr, te = idx[:cut], idx[cut:]
    return X[tr], X[te], [y[i] for i in tr], [y[i] for i in te]


# ─── Metrics ─────────────────────────────────────────────────────────────────

def accuracy(y_true, y_pred) -> float:
    return sum(a == b for a, b in zip(y_true, y_pred)) / len(y_true)


def classification_report(y_true, y_pred) -> dict:
    labels = sorted(set(y_true))
    ps, rs, fs = [], [], []
    for lbl in labels:
        tp = sum(1 for a, b in zip(y_true, y_pred) if a == lbl and b == lbl)
        fp = sum(1 for a, b in zip(y_true, y_pred) if a != lbl and b == lbl)
        fn = sum(1 for a, b in zip(y_true, y_pred) if a == lbl and b != lbl)
        p = tp / (tp + fp) if (tp + fp) else 0.0
        r = tp / (tp + fn) if (tp + fn) else 0.0
        f = 2 * p * r / (p + r) if (p + r) else 0.0
        ps.append(p); rs.append(r); fs.append(f)
    return {
        "accuracy":  accuracy(y_true, y_pred),
        "precision": sum(ps) / len(ps),
        "recall":    sum(rs) / len(rs),
        "f1":        sum(fs) / len(fs),
    }


# ─── NLP Vectorisers ─────────────────────────────────────────────────────────

class BagOfWords:
    """Word count matrix."""

    def fit(self, corpus):
        vocab = {w for doc in corpus for w in tokenize(doc)}
        self.vocab = sorted(vocab)
        self.word2idx = {w: i for i, w in enumerate(self.vocab)}
        return self

    def transform(self, corpus):
        X = np.zeros((len(corpus), len(self.vocab)))
        for i, doc in enumerate(corpus):
            for w in tokenize(doc):
                if w in self.word2idx:
                    X[i, self.word2idx[w]] += 1
        return X

    def fit_transform(self, corpus):
        return self.fit(corpus).transform(corpus)


class TFIDFVectorizer:
    """TF-IDF with smooth IDF and L2 normalisation."""

    def fit(self, corpus):
        tok = [tokenize(d) for d in corpus]
        vocab = {w for tokens in tok for w in tokens}
        self.vocab = sorted(vocab)
        self.word2idx = {w: i for i, w in enumerate(self.vocab)}
        N = len(corpus)
        df = Counter(w for tokens in tok for w in set(tokens))
        self.idf = {w: math.log((N + 1) / (df[w] + 1)) + 1 for w in self.vocab}
        return self

    def transform(self, corpus):
        X = np.zeros((len(corpus), len(self.vocab)))
        for i, doc in enumerate(corpus):
            tokens = tokenize(doc)
            if not tokens:
                continue
            tf = Counter(tokens)
            for w, cnt in tf.items():
                if w in self.word2idx:
                    X[i, self.word2idx[w]] = (cnt / len(tokens)) * self.idf[w]
        norms = np.linalg.norm(X, axis=1, keepdims=True)
        X /= np.where(norms == 0, 1, norms)
        return X

    def fit_transform(self, corpus):
        return self.fit(corpus).transform(corpus)


class NGramVectorizer:
    """Word n-gram count matrix."""

    def __init__(self, n=2):
        self.n = n

    def _ngrams(self, tokens):
        return [" ".join(tokens[i:i + self.n]) for i in range(len(tokens) - self.n + 1)]

    def fit(self, corpus):
        vocab = {g for doc in corpus for g in self._ngrams(tokenize(doc))}
        self.vocab = sorted(vocab)
        self.gram2idx = {g: i for i, g in enumerate(self.vocab)}
        return self

    def transform(self, corpus):
        X = np.zeros((len(corpus), len(self.vocab)))
        for i, doc in enumerate(corpus):
            for g in self._ngrams(tokenize(doc)):
                if g in self.gram2idx:
                    X[i, self.gram2idx[g]] += 1
        return X

    def fit_transform(self, corpus):
        return self.fit(corpus).transform(corpus)


class OneHotEncoder:
    """One-hot encode a sequence of categorical labels."""

    def fit(self, labels):
        self.classes_ = sorted(set(labels))
        self.class2idx = {c: i for i, c in enumerate(self.classes_)}
        return self

    def transform(self, labels):
        X = np.zeros((len(labels), len(self.classes_)))
        for i, lbl in enumerate(labels):
            X[i, self.class2idx[lbl]] = 1
        return X

    def fit_transform(self, labels):
        return self.fit(labels).transform(labels)

    def inverse_transform(self, X):
        return [self.classes_[int(np.argmax(row))] for row in X]


# ─── Classifiers ─────────────────────────────────────────────────────────────

class LogisticRegression:
    def __init__(self, lr=0.5, epochs=300):
        self.lr = lr
        self.epochs = epochs

    def _softmax(self, z):
        z = z - z.max(axis=1, keepdims=True)
        e = np.exp(z)
        return e / e.sum(axis=1, keepdims=True)

    def fit(self, X, y):
        self.classes_ = sorted(set(y))
        y_idx = np.array([self.classes_.index(c) for c in y])
        n, d = X.shape
        K = len(self.classes_)
        self.W = np.zeros((d, K))
        self.b = np.zeros(K)
        for _ in range(self.epochs):
            probs = self._softmax(X @ self.W + self.b)
            oh = np.zeros_like(probs)
            oh[np.arange(n), y_idx] = 1
            self.W -= self.lr * (X.T @ (probs - oh)) / n
            self.b -= self.lr * (probs - oh).mean(axis=0)
        return self

    def predict(self, X):
        idx = np.argmax(self._softmax(X @ self.W + self.b), axis=1)
        return [self.classes_[i] for i in idx]


class NaiveBayes:
    """Multinomial Naive Bayes with Laplace smoothing."""

    def fit(self, X, y):
        self.classes_ = sorted(set(y))
        n = len(y)
        self.log_prior = {}
        self.log_likelihood = {}
        for cls in self.classes_:
            mask = np.array([yi == cls for yi in y])
            X_cls = X[mask]
            self.log_prior[cls] = math.log(mask.sum() / n)
            counts = X_cls.sum(axis=0) + 1
            self.log_likelihood[cls] = np.log(counts / counts.sum())
        return self

    def predict(self, X):
        return [
            max(self.classes_, key=lambda c: self.log_prior[c] + self.log_likelihood[c] @ x)
            for x in X
        ]


class KNN:
    def __init__(self, k=3):
        self.k = k

    def fit(self, X, y):
        self.X_train = X
        self.y_train = list(y)
        return self

    def predict(self, X):
        preds = []
        for x in X:
            dists = np.linalg.norm(self.X_train - x, axis=1)
            nn = [self.y_train[i] for i in np.argsort(dists)[: self.k]]
            preds.append(Counter(nn).most_common(1)[0][0])
        return preds


class SVM:
    """One-vs-rest linear SVM via sub-gradient descent (hinge loss)."""

    def __init__(self, lr=0.01, epochs=200, C=1.0):
        self.lr = lr
        self.epochs = epochs
        self.C = C

    def _fit_binary(self, X, y_bin):
        n, d = X.shape
        w, b = np.zeros(d), 0.0
        for ep in range(self.epochs):
            lr = self.lr / (1 + ep * 0.01)
            sv = y_bin * (X @ w + b) < 1
            w -= lr * (w - self.C * (X[sv] * y_bin[sv, None]).sum(axis=0))
            b += lr * self.C * y_bin[sv].sum()
        return w, b

    def fit(self, X, y):
        self.classes_ = sorted(set(y))
        y_arr = np.array(y)
        self.classifiers = {
            cls: self._fit_binary(X, np.where(y_arr == cls, 1.0, -1.0))
            for cls in self.classes_
        }
        return self

    def predict(self, X):
        scores = {cls: X @ w + b for cls, (w, b) in self.classifiers.items()}
        return [max(self.classes_, key=lambda c: float(scores[c][i])) for i in range(len(X))]


class DecisionTree:
    """CART decision tree using Gini impurity."""

    def __init__(self, max_depth=4, min_samples=2):
        self.max_depth = max_depth
        self.min_samples = min_samples

    def _gini(self, y):
        n = len(y)
        return 1 - sum((c / n) ** 2 for c in Counter(y).values())

    def _best_split(self, X, y):
        base, n = self._gini(y), len(y)
        best = (-1.0, None, None)
        for feat in range(X.shape[1]):
            for thresh in np.unique(X[:, feat]):
                l = X[:, feat] <= thresh
                if l.sum() == 0 or (~l).sum() == 0:
                    continue
                gain = base - (l.sum() * self._gini(y[l]) + (~l).sum() * self._gini(y[~l])) / n
                if gain > best[0]:
                    best = (gain, feat, thresh)
        return best[1], best[2]

    def _build(self, X, y, depth):
        if depth == 0 or len(y) < self.min_samples or len(set(y)) == 1:
            return Counter(y).most_common(1)[0][0]
        feat, thresh = self._best_split(X, y)
        if feat is None:
            return Counter(y).most_common(1)[0][0]
        l = X[:, feat] <= thresh
        return {
            "f": feat, "t": thresh,
            "l": self._build(X[l],  y[l],  depth - 1),
            "r": self._build(X[~l], y[~l], depth - 1),
        }

    def fit(self, X, y):
        self.tree_ = self._build(X, np.array(y), self.max_depth)
        return self

    def _walk(self, node, x):
        if not isinstance(node, dict):
            return node
        return self._walk(node["l"] if x[node["f"]] <= node["t"] else node["r"], x)

    def predict(self, X):
        return [self._walk(self.tree_, x) for x in X]


class RandomForest:
    """Ensemble of DecisionTrees with bootstrap sampling and random feature subsets."""

    def __init__(self, n_trees=10, max_depth=4, max_features=None):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.max_features = max_features

    def fit(self, X, y):
        n, d = X.shape
        mf = self.max_features or max(1, int(d ** 0.5))
        y = np.array(y)
        self.estimators_ = []
        for _ in range(self.n_trees):
            rows = np.random.choice(n, n, replace=True)
            cols = np.random.choice(d, mf, replace=False)
            tree = DecisionTree(max_depth=self.max_depth)
            tree.fit(X[np.ix_(rows, cols)], y[rows])
            self.estimators_.append((tree, cols))
        return self

    def predict(self, X):
        all_preds = [tree.predict(X[:, cols]) for tree, cols in self.estimators_]
        return [Counter(row).most_common(1)[0][0] for row in zip(*all_preds)]


# ─── Clustering ───────────────────────────────────────────────────────────────

class KMeans:
    def __init__(self, k=2, max_iter=100, seed=42):
        self.k = k
        self.max_iter = max_iter
        self.seed = seed

    def _assign(self, X):
        dists = np.linalg.norm(X[:, None] - self.centroids_[None], axis=2)
        return np.argmin(dists, axis=1)

    def fit(self, X):
        rng = np.random.default_rng(self.seed)
        self.centroids_ = X[rng.choice(len(X), self.k, replace=False)].copy()
        for _ in range(self.max_iter):
            labels = self._assign(X)
            new_c = np.array([
                X[labels == i].mean(axis=0) if (labels == i).any() else self.centroids_[i]
                for i in range(self.k)
            ])
            if np.allclose(new_c, self.centroids_):
                break
            self.centroids_ = new_c
        self.labels_ = self._assign(X)
        return self

    def predict(self, X):
        return self._assign(X)


# ─── Demo ────────────────────────────────────────────────────────────────────

if __name__ == "__main__":
    np.random.seed(0)

    sentences = [
        ("I love this product it is amazing",           "positive"),
        ("What a wonderful experience today",            "positive"),
        ("This is the best thing I have ever used",      "positive"),
        ("Absolutely fantastic results really happy",    "positive"),
        ("Great quality and excellent service",          "positive"),
        ("Really enjoyed this highly recommend",         "positive"),
        ("Outstanding performance very impressed",       "positive"),
        ("Brilliant work exceeded my expectations",      "positive"),
        ("Very satisfied with the outcome",              "positive"),
        ("Superb quality love every bit of it",          "positive"),
        ("Incredible value for money thumbs up",         "positive"),
        ("Wonderful product works perfectly well",       "positive"),
        ("This is terrible and very disappointing",      "negative"),
        ("Worst experience I have ever had",             "negative"),
        ("Absolutely awful do not recommend",            "negative"),
        ("Poor quality broken on arrival",               "negative"),
        ("Very bad product total waste of money",        "negative"),
        ("Horrible service never buying again",          "negative"),
        ("Dreadful quality fell apart quickly",          "negative"),
        ("Terrible experience really frustrated",        "negative"),
        ("Completely useless returned immediately",      "negative"),
        ("Awful product cheap and flimsy",               "negative"),
        ("Disappointing results not worth it",           "negative"),
        ("Very poor quality do not buy this",            "negative"),
    ]

    corpus = [s for s, _ in sentences]
    labels = [l for _, l in sentences]

    # ── Vectorise ──────────────────────────────────────────────────────────────
    print("=" * 62)
    print("Vectorisers")
    print("=" * 62)

    tfidf = TFIDFVectorizer()
    X = tfidf.fit_transform(corpus)
    X_tr, X_te, y_tr, y_te = train_test_split(X, labels, test_size=0.25)
    print(f"TF-IDF     vocab={len(tfidf.vocab)}  shape={X.shape}  "
          f"train/test={len(y_tr)}/{len(y_te)}")

    bow_X = BagOfWords().fit_transform(corpus)
    print(f"BoW        shape={bow_X.shape}")

    bi_X = NGramVectorizer(n=2).fit_transform(corpus)
    print(f"Bigrams    shape={bi_X.shape}")

    ohe = OneHotEncoder()
    ohe_X = ohe.fit_transform(labels)
    print(f"OneHot     shape={ohe_X.shape}  "
          f"inverse sample={ohe.inverse_transform(ohe_X[:3])}")

    # ── Classifiers ────────────────────────────────────────────────────────────
    classifiers = [
        ("Logistic Regression", LogisticRegression(lr=0.5, epochs=300)),
        ("Naive Bayes",         NaiveBayes()),
        ("KNN (k=3)",           KNN(k=3)),
        ("SVM",                 SVM(lr=0.01, epochs=200, C=1.0)),
        ("Decision Tree",       DecisionTree(max_depth=4)),
        ("Random Forest",       RandomForest(n_trees=10, max_depth=4)),
    ]

    print()
    print("=" * 62)
    print("Classifiers (trained on TF-IDF vectors)")
    print("=" * 62)
    print(f"{'Model':<22}  {'Acc':>5}  {'P':>5}  {'R':>5}  {'F1':>5}")
    print("-" * 62)
    for name, clf in classifiers:
        clf.fit(X_tr, y_tr)
        preds = clf.predict(X_te)
        m = classification_report(y_te, preds)
        print(f"{name:<22}  {m['accuracy']:.3f}  {m['precision']:.3f}"
              f"  {m['recall']:.3f}  {m['f1']:.3f}")

    # ── K-Means ────────────────────────────────────────────────────────────────
    print()
    print("=" * 62)
    print("K-Means clustering (k=2, unsupervised)")
    print("=" * 62)
    km = KMeans(k=2)
    km.fit(X)
    for cid in range(km.k):
        true_lbls = [labels[i] for i in range(len(labels)) if km.labels_[i] == cid]
        print(f"  Cluster {cid}: {Counter(true_lbls)}")
