# ==========================================================================================
# 8. MODEL EVALUATION & METRICS
# ==========================================================================================

import numpy as np
from sklearn.metrics import (mean_squared_error, mean_absolute_error, r2_score,
                             accuracy_score, precision_score, recall_score, f1_score, confusion_matrix,
                             roc_auc_score, average_precision_score, matthews_corrcoef, balanced_accuracy_score)

y = np.random.randint(0, 2, size=100)
y_pred = np.random.rand(100)
y_prob = np.column_stack([1-y_pred, y_pred])

# ------------------------- REGRESSION METRICS --------------------------

# RMSE (Root Mean Squared Error)
rmse = np.sqrt(mean_squared_error(y, y_pred))
# sqrt(mean squared error). Sensitive to outliers.

# MAE (Mean Absolute Error)
mae = mean_absolute_error(y, y_pred)
# Mean absolute difference. Robust to outliers.

# R2 (Coefficient of Determination)
r2 = r2_score(y, y_pred)
# 1 = perfect, 0 = mean prediction. Can be negative.

# MAPE (Mean Absolute Percentage Error)
def mape(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
# Not defined if y_true contains zero.

# ------------------------- CLASSIFICATION METRICS ----------------------

# Accuracy
acc = accuracy_score(y, np.round(y_pred))
# Proportion of correct labels.

# Precision
prec = precision_score(y, np.round(y_pred))
# TP/(TP+FP): Positive predictive value.

# Recall (Sensitivity)
rec = recall_score(y, np.round(y_pred))
# TP/(TP+FN): True positive rate.

# F1 Score
f1 = f1_score(y, np.round(y_pred))
# Harmonic mean of precision and recall.

# Balanced accuracy
bal_acc = balanced_accuracy_score(y, np.round(y_pred))
# Average of recall per class, corrects for imbalance.

# ROC-AUC
roc_auc = roc_auc_score(y, y_prob[:,1])
# Area under ROC curve; measures ability to rank positive > negative.

# PR-AUC
pr_auc = average_precision_score(y, y_prob[:,1])
# Area under Precision-Recall curve. Preferred for high imbalance.

# MCC (Matthews Correlation Coefficient)
mcc = matthews_corrcoef(y, np.round(y_pred))
# Correlation between predicted and observed. -1 (inverse), 0 (random), +1 (perfect).

# Sensitivity, Specificity
cm = confusion_matrix(y, np.round(y_pred))
tn, fp, fn, tp = cm.ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
# Sensitivity: recall for positive class. Specificity: recall for negative.

# ------------------------- METRIC PITFALLS ----------------------------

# Accuracy misleading with imbalance.
# ROC-AUC can be overoptimistic on imbalance.
# F1 ignores TNs.
# MAPE explodes at y=0.

# ==========================================================================================
# 9. STATISTICAL MODEL COMPARISON
# ==========================================================================================

from scipy import stats

# ------------------------------------------------------------------------------------------
# Cross-validation (theory)
# ------------------------------------------------------------------------------------------
# Purpose: Estimate generalization error; mitigate overfitting to specific data splits.

# ------------------------------------------------------------------------------------------
# Paired t-test on CV Scores
# ------------------------------------------------------------------------------------------
# WRONG: Comparing means without considering paired nature!
# CORRECT: For each fold, difference in scores across models; one-sample t-test on those differences.

def paired_t_test_cv(scores_model1, scores_model2):
    """Paired t-test for cross-validation model comparison."""
    diffs = np.array(scores_model1) - np.array(scores_model2)
    tstat, pval = stats.ttest_1samp(diffs, 0)
    return tstat, pval
# p < 0.05: Significant difference.

# ------------------------------------------------------------------------------------------
# 5x2cv paired t-test (Corrects for variance estimation bias. Dietterich, 1998)
# Not implemented in standard packages, formula:
# - Two models trained/tested 5 times on random 50/50 splits.
# - t statistic pools variance across all runs.

# ------------------------------------------------------------------------------------------
# Diebold–Mariano Test (time series forecast comparison)
# ------------------------------------------------------------------------------------------
# statsmodels example (not in sklearn)
from statsmodels.tsa.stattools import acf
def diebold_mariano_test(e1, e2, h=1, alternative='two-sided'):
    """
    e1, e2: Prediction errors for two models on same test set
    h: forecast horizon (usually 1)
    """
    d = e1 - e2
    d_mean = np.mean(d)
    d_var = np.var(d, ddof=1)
    n = len(d)
    dm_stat = d_mean / np.sqrt((d_var + 2 * np.sum([acf(d, nlags=k)[-1] for k in range(1, h)])) / n)
    pval = 2 * (1 - stats.norm.cdf(np.abs(dm_stat)))
    return dm_stat, pval

# ------------------------------------------------------------------------------------------
# Friedman test (for >2 models over multiple datasets or CV splits)
# ------------------------------------------------------------------------------------------
# Ranks models per split; tests if at least one model performs differently.
from scipy.stats import friedmanchisquare
# Example: scores1, scores2, scores3 = arrays of scores per fold
# friedmanchisquare(scores1, scores2, scores3)

# ------------------------------------------------------------------------------------------
# Nemenyi post-hoc test (pairwise model differences after Friedman)
# Not implemented in standard Python; exam: compute avg rank per model, compare with critical difference.

# ------------------------------------------------------------------------------------------
# Why naive comparison is invalid
# ------------------------------------------------------------------------------------------
# Naive: Compare averages, ignore dependence = inflated Type I error.
# Need to account for variance and fold-wise dependency ("variance dependency").
