Source code for scitex_stats.tests.parametric._test_anova

#!/usr/bin/env python3
# Timestamp: "2025-10-01 16:30:00 (ywatanabe)"
# File: scitex_stats/tests/parametric/_test_anova.py
# ----------------------------------------
from __future__ import annotations

r"""
Functionalities:
  - Perform one-way ANOVA for independent samples
  - Compute eta-squared effect size
  - Generate box plots and distribution visualizations
  - Support flexible output formats (dict or DataFrame)
  - Automatic normality and homogeneity checking

Dependencies:
  - packages: numpy, pandas, scipy, matplotlib

IO:
  - input: Multiple independent samples (arrays or Series)
  - output: Test results (dict or DataFrame) and optional figure
"""

"""Imports"""
import os  # noqa: E402
from typing import List, Literal, Optional, Union  # noqa: E402

import matplotlib.axes  # noqa: E402
import numpy as np  # noqa: E402
import pandas as pd  # noqa: E402
from scipy import stats  # noqa: E402

import matplotlib.pyplot as _mpl_plt  # noqa: E402
from scitex_stats._logging import getLogger
from scitex_stats._utils._formatters import fmt_stat, fmt_sym  # noqa: E402

__FILE__ = __file__
__DIR__ = os.path.dirname(__FILE__)

logger = getLogger(__name__)

"""Functions"""


[docs] def test_anova( # noqa: C901 groups: Optional[List[Union[np.ndarray, pd.Series]]] = None, var_names: Optional[List[str]] = None, alpha: float = 0.05, check_assumptions: bool = True, plot: bool = False, ax: Optional[matplotlib.axes.Axes] = None, data: Union[pd.DataFrame, str, None] = None, value_col: Optional[str] = None, group_col: Optional[str] = None, return_as: Literal["dict", "dataframe"] = "dict", decimals: int = 3, verbose: bool = False, ) -> Union[dict, pd.DataFrame]: r""" Perform one-way ANOVA for independent samples. Parameters ---------- groups : list of arrays List of sample arrays for each group (minimum 2 groups) var_names : list of str, optional Names for each group. If None, uses 'Group 1', 'Group 2', etc. alpha : float, default 0.05 Significance level check_assumptions : bool, default True Whether to check normality and homogeneity assumptions plot : bool, default False Whether to generate visualization ax : matplotlib.axes.Axes, optional Axes object to plot on. If None and plot=True, creates new figure. If provided, automatically enables plotting. data : DataFrame, str, or None, optional DataFrame or CSV path. When provided with value_col and group_col, groups are extracted automatically (seaborn-style). value_col : str, optional Column containing measurement values (used with data=). group_col : str, optional Column containing group labels (used with data=). return_as : {'dict', 'dataframe'}, default 'dict' Output format decimals : int, default 3 Number of decimal places for rounding verbose : bool, default False Whether to print test results Returns ------- results : dict or DataFrame Test results including: - test_method: 'One-way ANOVA' - statistic: F-statistic value - pvalue: p-value - stars: Significance stars - significant: Whether null hypothesis is rejected - effect_size: Eta-squared (η²) - effect_size_metric: 'eta-squared' - effect_size_interpretation: Interpretation of eta-squared - n_groups: Number of groups - n_samples: Sample sizes for each group - df_between: Degrees of freedom between groups - df_within: Degrees of freedom within groups - var_names: Group labels - assumptions_met: Whether assumptions are satisfied - H0: Null hypothesis description Notes ----- One-way ANOVA (Analysis of Variance) tests whether samples from different groups have the same population mean. **Null Hypothesis (H0)**: All groups have equal population means **Alternative Hypothesis (H1)**: At least one group mean differs **Assumptions**: 1. **Independence**: Observations within and between groups are independent 2. **Normality**: Data in each group are normally distributed - Can be checked with test_shapiro() - Robust to moderate violations with large samples (n > 30 per group) 3. **Homogeneity of variance**: Groups have equal population variances - Can be checked with Levene's test - If violated, consider Welch's ANOVA or non-parametric alternative **When assumptions are violated**: - Non-normality: Use test_kruskal() (Kruskal-Wallis test) - Unequal variances: Use Welch's ANOVA (not yet implemented) - Outliers present: Use test_kruskal() or remove outliers **F-Statistic**: .. math:: F = \\frac{MS_{between}}{MS_{within}} = \\frac{SS_{between}/(k-1)}{SS_{within}/(N-k)} Where: - k: Number of groups - N: Total sample size - SS: Sum of squares - MS: Mean square **Effect Size (Eta-squared)**: .. math:: \\eta^2 = \\frac{SS_{between}}{SS_{total}} Interpretation: - η² < 0.01: negligible - η² < 0.06: small - η² < 0.14: medium - η² ≥ 0.14: large **Post-hoc tests**: If significant, perform pairwise comparisons with correction: - test_ttest_ind() for all pairs (if assumptions met) - test_brunner_munzel() for all pairs (robust alternative) - correct_bonferroni() or correct_fdr() for multiple comparisons References ---------- .. [1] Fisher, R. A. (1925). Statistical Methods for Research Workers. Oliver and Boyd. .. [2] Cohen, J. (1988). Statistical Power Analysis for the Behavioral Sciences (2nd ed.). Routledge. .. [3] Maxwell, S. E., & Delaney, H. D. (2004). Designing Experiments and Analyzing Data: A Model Comparison Perspective (2nd ed.). Psychology Press. Examples -------- >>> # Three groups with different means >>> group1 = np.array([1, 2, 3, 4, 5]) >>> group2 = np.array([3, 4, 5, 6, 7]) >>> group3 = np.array([5, 6, 7, 8, 9]) >>> result = test_anova([group1, group2, group3]) >>> result['rejected'] True >>> # With auto-created figure >>> result = test_anova( ... [group1, group2, group3], ... var_names=['Control', 'Treatment 1', 'Treatment 2'], ... plot=True ... ) >>> # Plot on existing axes >>> fig, ax = plt.subplots() >>> result = test_anova([group1, group2, group3], ax=ax) >>> # Export results >>> from scitex_stats.utils._normalizers import convert_results >>> convert_results(result, return_as='excel', path='anova_results.xlsx') """ from scitex_stats._utils._effect_size import eta_squared, interpret_eta_squared from scitex_stats._utils._formatters import p2stars from scitex_stats._utils._normalizers import convert_results, force_dataframe from scitex_stats.tests.normality._test_shapiro import test_normality # Resolve groups from DataFrame (seaborn-style data= parameter) if data is not None and value_col is not None and group_col is not None: from scitex_stats._utils._csv_support import resolve_groups groups, group_names = resolve_groups(data, value_col, group_col) if var_names is None: var_names = group_names # Validate input if len(groups) < 2: raise ValueError("ANOVA requires at least 2 groups") # Convert to numpy arrays and remove NaN groups = [np.asarray(g) for g in groups] groups = [g[~np.isnan(g)] for g in groups] # Generate default names if not provided if var_names is None: var_names = [f"Group {i + 1}" for i in range(len(groups))] if len(var_names) != len(groups): raise ValueError("Number of var_names must match number of groups") # Get sample sizes n_samples = [len(g) for g in groups] n_groups = len(groups) n_total = sum(n_samples) # Check assumptions if requested assumptions_met = True assumption_warnings = [] if check_assumptions: # Check normality for each group normality_check = test_normality( *groups, var_names=var_names, alpha=alpha, warn=False ) if not normality_check["all_normal"]: assumptions_met = False non_normal = [ r["var_x"] for r in normality_check["results"] if not r["normal"] ] warning_msg = ( f"Normality assumption violated for: {', '.join(non_normal)}. " "Consider using test_kruskal() (Kruskal-Wallis test) instead." ) assumption_warnings.append(warning_msg) logger.warning(warning_msg) # Check homogeneity of variance (Levene's test) _, levene_p = stats.levene(*groups) if levene_p < alpha: assumptions_met = False warning_msg = ( f"Homogeneity of variance violated (Levene's test: p={levene_p:.4f}). " "Consider using Welch's ANOVA or test_kruskal()." ) assumption_warnings.append(warning_msg) logger.warning(warning_msg) # Perform one-way ANOVA f_result = stats.f_oneway(*groups) f_stat = float(f_result.statistic) pvalue = float(f_result.pvalue) # Determine rejection rejected = pvalue < alpha # Compute effect size (eta-squared) effect_size = eta_squared(groups) effect_size_interp = interpret_eta_squared(effect_size) # Compute degrees of freedom df_between = n_groups - 1 df_within = n_total - n_groups # Compile results result = { "test_method": "One-way ANOVA", "statistic": round(f_stat, decimals), "stat_symbol": "F", "n_groups": n_groups, "n_samples": n_samples, "df_between": df_between, "df_within": df_within, "var_names": var_names, "pvalue": round(pvalue, decimals), "stars": p2stars(pvalue), "alpha": alpha, "significant": rejected, "effect_size": round(effect_size, decimals), "effect_size_metric": "eta-squared", "effect_size_interpretation": effect_size_interp, "assumptions_met": assumptions_met, "H0": "All groups have equal population means", } # Add assumption warnings if any if assumption_warnings: result["assumption_warnings"] = assumption_warnings # Add post-hoc recommendation if significant if result["significant"]: if assumptions_met: result["recommendation"] = ( "Significant difference detected. Perform post-hoc pairwise comparisons " "with test_ttest_ind() and apply correction (correct_bonferroni or correct_fdr)." ) else: result["recommendation"] = ( "Significant difference detected, but assumptions violated. " "Consider using test_kruskal() or performing pairwise test_brunner_munzel() with correction." ) else: result["recommendation"] = "No significant difference between groups." # Log results if verbose if verbose: logger.info( f"One-way ANOVA: F({df_between}, {df_within}) = {f_stat:.3f}, p = {pvalue:.4f} {p2stars(pvalue)}" ) logger.info(f"η² = {effect_size:.3f} ({effect_size_interp})") # Auto-enable plotting if ax is provided if ax is not None: plot = True # Generate plot if requested if plot: if ax is None: fig, ax = _mpl_plt.subplots() _plot_anova(groups, var_names, result, ax) # Convert to requested format if return_as == "dataframe": result = force_dataframe(result) elif return_as not in ["dict", "dataframe"]: # Use universal converter for other formats return convert_results(result, return_as=return_as) return result
def _plot_anova(groups, var_names, result, ax): """Create violin+swarm visualization for ANOVA results on given axes.""" from scitex_stats._plot_helpers import ( significance_bracket, stats_text_box, violin_swarm, ) positions = list(np.arange(1, len(groups) + 1)) # Violin + swarm violin_swarm(ax, groups, positions, var_names) ax.set_title("One-way ANOVA") # Stats text box n_total = sum(result["n_samples"]) lines = [ fmt_stat( "F", result["statistic"], df=f"{result['df_between']}, {result['df_within']}", ), fmt_stat("p", result["pvalue"], fmt=".4f", stars=result["stars"]), f"{fmt_sym('eta^2')} = {result['effect_size']:.3f}", f"{fmt_sym('n')} = {n_total}", ] stats_text_box(ax, lines) # Add significance bracket if result["significant"]: significance_bracket(ax, positions[0], positions[-1], result["stars"], groups) # EOF