Source code for scitex_stats.tests.correlation._test_theilsen

#!/usr/bin/env python3
# File: scitex_stats/tests/correlation/_test_theilsen.py
from __future__ import annotations

"""
Theil-Sen robust regression estimator.

A non-parametric regression method that is robust to outliers.
Computes the median of slopes between all pairs of points.
"""

import warnings
from typing import Union

import numpy as np
import pandas as pd
from scipy import stats


[docs] def test_theilsen( x: Union[np.ndarray, pd.Series, list, str], y: Union[np.ndarray, pd.Series, list, str], var_x: str = "x", var_y: str = "y", data: Union[pd.DataFrame, str, None] = None, return_as: str = "dict", verbose: bool = True, ) -> Union[dict, pd.DataFrame]: """ Theil-Sen robust regression estimator. A robust non-parametric regression method that estimates the slope as the median of all pairwise slopes. Highly resistant to outliers (up to 29.3% breakdown point). Parameters ---------- x : array-like Independent variable y : array-like Dependent variable var_x : str, default="x" Name of independent variable (for reporting) var_y : str, default="y" Name of dependent variable (for reporting) data : DataFrame, str, or None, optional DataFrame or CSV path. When provided, string values for x/y are resolved as column names (seaborn-style). return_as : str, default="dict" Format of return value: "dict" or "dataframe" verbose : bool, default=True Whether to print results Returns ------- dict or pd.DataFrame Dictionary or DataFrame containing: - slope : float Theil-Sen slope estimate (median of pairwise slopes) - intercept : float Intercept of the regression line - low_slope : float Lower bound of slope confidence interval - high_slope : float Upper bound of slope confidence interval - var_x : str Name of independent variable - var_y : str Name of dependent variable Notes ----- The Theil-Sen estimator: - Is robust to outliers (up to ~29% outliers) - Has no distributional assumptions - Is asymptotically normal - Has ~64% efficiency compared to OLS for normal data - Computational complexity: O(n²) References ---------- .. [1] Theil, H. (1950). "A rank-invariant method of linear and polynomial regression analysis". Indagationes Mathematicae, 12, 85-91. .. [2] Sen, P.K. (1968). "Estimates of the regression coefficient based on Kendall's tau". Journal of the American Statistical Association, 63, 1379-1389. Examples -------- >>> import numpy as np >>> from scitex_stats.tests.correlation import test_theilsen >>> x = np.array([1, 2, 3, 4, 5]) >>> y = np.array([2, 4, 6, 8, 10]) >>> result = test_theilsen(x, y, verbose=False) >>> print(f"Slope: {result['slope']:.3f}") Slope: 2.000 >>> # With outlier >>> y_outlier = np.array([2, 4, 6, 8, 100]) # One extreme outlier >>> result = test_theilsen(x, y_outlier, verbose=False) >>> print(f"Robust slope: {result['slope']:.3f}") Robust slope: 2.000 """ # Resolve column names from DataFrame (seaborn-style data= parameter) if data is not None: from scitex_stats._utils._csv_support import resolve_columns resolved = resolve_columns(data, x=x, y=y) x, y = resolved["x"], resolved["y"] # Convert inputs to numpy arrays x = np.asarray(x).flatten() y = np.asarray(y).flatten() # Remove NaN/Inf mask = np.isfinite(x) & np.isfinite(y) if not mask.all(): n_removed = (~mask).sum() warnings.warn( f"Removed {n_removed} NaN/Inf values from input data", UserWarning, ) x = x[mask] y = y[mask] # Check for sufficient data if len(x) < 3: raise ValueError(f"Need at least 3 valid data points, got {len(x)}") # Check for variation if len(np.unique(x)) < 2: raise ValueError("Independent variable has no variation (constant values)") # Compute Theil-Sen estimator result = stats.theilslopes(y, x, alpha=0.95) # Prepare output dictionary output = { "slope": result.slope, "intercept": result.intercept, "low_slope": result.low_slope, "high_slope": result.high_slope, "var_x": var_x, "var_y": var_y, "H0": f"No linear association between {var_x} and {var_y} (slope = 0)", } # Print results if verbose if verbose: print("\n" + "=" * 70) print("Theil-Sen Robust Regression") print("=" * 70) print(f"Variables: {var_y} ~ {var_x}") print(f"Sample size: n = {len(x)}") print("\nResults:") print(f" Slope: {output['slope']:.6f}") print(f" Intercept: {output['intercept']:.6f}") print(f" 95% CI: [{output['low_slope']:.6f}, {output['high_slope']:.6f}]") print("\nInterpretation:") print(f" For each unit increase in {var_x},") print(f" {var_y} changes by {output['slope']:.6f} units (median slope)") print("=" * 70 + "\n") # Return as requested format if return_as == "dataframe": return pd.DataFrame([output]) elif return_as == "dict": return output else: raise ValueError(f"return_as must be 'dict' or 'dataframe', got '{return_as}'")
if __name__ == "__main__": # Example usage np.random.seed(42) # Clean data print("Example 1: Clean linear data") x = np.linspace(0, 10, 50) y = 2 * x + 1 + np.random.normal(0, 1, 50) result = test_theilsen(x, y, var_x="x", var_y="y") # Data with outliers print("\n" + "=" * 70) print("Example 2: Data with outliers") x_out = np.linspace(0, 10, 50) y_out = 2 * x_out + 1 + np.random.normal(0, 1, 50) # Add outliers y_out[[10, 20, 30]] += np.array([20, -15, 25]) result_out = test_theilsen(x_out, y_out, var_x="x", var_y="y") # Compare with OLS from scipy.stats import linregress ols = linregress(x_out, y_out) print("\nComparison:") print(f" Theil-Sen slope: {result_out['slope']:.6f}") print(f" OLS slope: {ols.slope:.6f}") print(" True slope: 2.000000") print( f"\nTheil-Sen is more robust to the {((y_out[[10, 20, 30]] - (2 * x_out[[10, 20, 30]] + 1)) != 0).sum()} outliers!" )