#!/usr/bin/env python3
# Timestamp: "2025-10-01 17:30:00 (ywatanabe)"
# File: scitex_stats/tests/normality/_test_ks_1samp.py
# ----------------------------------------
from __future__ import annotations
import os
__FILE__ = __file__
__DIR__ = os.path.dirname(__FILE__)
# ----------------------------------------
r"""
Functionalities:
- Perform one-sample Kolmogorov-Smirnov test (compare to reference distribution)
- Generate CDF comparison plots
- Support flexible output formats (dict or DataFrame)
Dependencies:
- packages: numpy, pandas, scipy, matplotlib
IO:
- input: One sample (array or Series)
- output: Test results (dict or DataFrame) and optional figure
"""
"""Imports"""
import argparse # noqa: E402
from typing import Callable, Literal, Optional, Union # noqa: E402
import matplotlib.axes # noqa: E402
import matplotlib.pyplot as _mpl_plt # noqa: E402
import matplotlib.pyplot as plt # noqa: E402
import numpy as np # noqa: E402
import pandas as pd # noqa: E402
from scipy import stats # noqa: E402
from scitex_stats._logging import getLogger
from scitex_stats._utils._formatters import fmt_stat, fmt_sym # noqa: E402
logger = getLogger(__name__)
"""Functions"""
[docs]
def test_ks_1samp( # noqa: C901
x: Union[np.ndarray, pd.Series, str],
cdf: Union[str, Callable] = "norm",
args: tuple = (),
var_x: str = "x",
alternative: Literal["two-sided", "less", "greater"] = "two-sided",
alpha: float = 0.05,
plot: bool = False,
ax: Optional[matplotlib.axes.Axes] = None,
data: Union[pd.DataFrame, str, None] = None,
return_as: Literal["dict", "dataframe"] = "dict",
decimals: int = 3,
verbose: bool = False,
) -> Union[dict, pd.DataFrame]:
r"""
Perform one-sample Kolmogorov-Smirnov test.
Parameters
----------
x : array or Series
Sample to test
cdf : str or callable, default 'norm'
Reference distribution. Either:
- String: 'norm', 'uniform', 'expon', etc. (scipy.stats distribution name)
- Callable: CDF function
args : tuple, default ()
Distribution parameters (e.g., (loc, scale) for normal)
var_x : str, default 'x'
Label for sample
alternative : {'two-sided', 'less', 'greater'}, default 'two-sided'
Alternative hypothesis
alpha : float, default 0.05
Significance level
plot : bool, default False
Whether to generate CDF comparison plot
data : DataFrame, str, or None, optional
DataFrame or CSV path. When provided, string value for x
is resolved as a column name (seaborn-style).
return_as : {'dict', 'dataframe'}, default 'dict'
Output format
decimals : int, default 3
Number of decimal places for rounding
Returns
-------
results : dict or DataFrame
Test results including:
- test_method: 'Kolmogorov-Smirnov test (1-sample)'
- statistic_name: 'D'
- statistic: KS D-statistic (maximum CDF difference)
- pvalue: p-value
- pstars: Significance stars
- rejected: Whether null hypothesis is rejected
- n_x: Sample size
- var_x: Variable label
- reference_distribution: Name of reference distribution
- H0: Null hypothesis description
fig : matplotlib.figure.Figure, optional
Figure with CDF comparison (only if plot=True)
Notes
-----
The one-sample Kolmogorov-Smirnov test compares the empirical cumulative
distribution function (ECDF) of the sample against a reference CDF.
**Null Hypothesis (H0)**: Data follow the specified distribution
**Test Statistic D**:
.. math::
D = \\sup_x |F_n(x) - F(x)|
Where:
- F_n(x): Empirical CDF of sample
- F(x): Reference CDF
**Advantages**:
- Distribution-free (no assumptions about data)
- Can test against any continuous distribution
- More general than Shapiro-Wilk (not limited to normality)
**Disadvantages**:
- Less powerful than Shapiro-Wilk for normality testing
- Sensitive to sample size (large n → high power, may detect trivial deviations)
- Assumes continuous distribution (not suitable for discrete data)
**When to use**:
- Testing goodness-of-fit to any continuous distribution
- Comparing sample to theoretical distribution
- When Shapiro-Wilk is not applicable (non-normal distributions)
- Large sample sizes (n > 50)
References
----------
.. [1] Kolmogorov, A. (1933). "Sulla determinazione empirica di una legge
di distribuzione". Giornale dell'Istituto Italiano degli Attuari, 4, 83-91.
.. [2] Smirnov, N. (1948). "Table for estimating the goodness of fit of
empirical distributions". Annals of Mathematical Statistics, 19(2), 279-281.
Examples
--------
>>> # Test if data are normally distributed
>>> x = np.random.normal(0, 1, 100)
>>> result = test_ks_1samp(x, cdf='norm', args=(0, 1))
>>> result['rejected']
False
>>> # Test if data are uniformly distributed
>>> x = np.random.uniform(0, 1, 100)
>>> result = test_ks_1samp(x, cdf='uniform', args=(0, 1))
"""
from scitex_stats._utils._formatters import p2stars
from scitex_stats._utils._normalizers import convert_results, force_dataframe
# Resolve column names from DataFrame (seaborn-style data= parameter)
if data is not None:
from scitex_stats._utils._csv_support import resolve_columns
resolved = resolve_columns(data, x=x)
x = resolved["x"]
# Convert to numpy array and remove NaN
x = np.asarray(x)
x = x[~np.isnan(x)]
n_x = len(x)
# Check sample size
if n_x < 3:
raise ValueError("KS test requires at least 3 observations")
# Get reference distribution
if isinstance(cdf, str):
ref_dist_name = cdf
# Get scipy distribution
ref_dist = getattr(stats, cdf)
if args:
def cdf_func(t): # noqa: E731
return ref_dist.cdf(t, *args)
else:
cdf_func = ref_dist.cdf
else:
ref_dist_name = "custom"
cdf_func = cdf
# Perform KS test
ks_result = stats.ks_1samp(x, cdf_func, alternative=alternative)
d_stat = float(ks_result.statistic)
pvalue = float(ks_result.pvalue)
# Determine if distribution matches
rejected = pvalue < alpha
matches = not rejected
# Compile results
result = {
"test_method": "Kolmogorov-Smirnov test (1-sample)",
"statistic": round(d_stat, decimals),
"stat_symbol": "D",
"n": n_x,
"var_x": var_x,
"pvalue": round(pvalue, decimals),
"stars": p2stars(pvalue),
"alpha": alpha,
"significant": rejected,
"matches_distribution": matches,
"reference_distribution": ref_dist_name,
"H0": f"Data follow {ref_dist_name} distribution",
}
# Log results if verbose
if verbose:
logger.info(
f"KS test (1-sample): D = {d_stat:.3f}, p = {pvalue:.4f} {p2stars(pvalue)}"
)
logger.info(f"Matches {ref_dist_name}: {matches}")
# Auto-enable plotting if ax is provided
if ax is not None:
plot = True
# Generate plot if requested
if plot:
if ax is None:
fig, axes = _mpl_plt.subplots(1, 2, figsize=(14, 6))
_plot_ks_1samp_full(x, cdf_func, var_x, result, ref_dist_name, axes)
else:
_plot_ks_1samp_simple(x, cdf_func, var_x, result, ref_dist_name, ax)
# Convert to requested format
if return_as == "dataframe":
result = force_dataframe(result)
elif return_as not in ["dict", "dataframe"]:
return convert_results(result, return_as=return_as)
return result
def _plot_ks_1samp_full(x, cdf_func, var_x, result, ref_dist_name, axes):
"""Create 2-panel CDF comparison plot for one-sample KS test."""
from scitex_stats._plot_helpers import stats_text_box
# Plot 1: CDF comparison
ax = axes[0]
# Compute ECDF
x_sorted = np.sort(x)
ecdf = np.arange(1, len(x) + 1) / len(x)
# Compute reference CDF
ref_cdf = cdf_func(x_sorted)
# Plot both CDFs
ax.step(
x_sorted,
ecdf,
where="post",
label=f"Empirical ({var_x})",
)
ax.plot(
x_sorted,
ref_cdf,
label=f"Reference ({ref_dist_name})",
)
# Mark maximum difference
diff = np.abs(ecdf - ref_cdf)
max_idx = np.argmax(diff)
ax.vlines(
x_sorted[max_idx],
ecdf[max_idx],
ref_cdf[max_idx],
linestyles="dashed",
label=f"D = {result['statistic']:.3f}",
)
ax.set_xlabel("Value")
ax.set_ylabel("Cumulative Probability")
ax.set_title("KS Test (1-sample)")
ax.legend()
# Add text with results
stats_text_box(
ax,
[
fmt_stat("D", result["statistic"]),
fmt_stat("p", result["pvalue"], fmt=".4f", stars=result["stars"]),
f"Matches: {result['matches_distribution']}",
f"{fmt_sym('n')} = {result['n']}",
],
)
# Plot 2: Histogram with reference PDF
ax = axes[1]
ax.hist(x, bins="auto", density=True, label="Data")
# If reference is a known distribution, plot PDF
if ref_dist_name != "custom":
x_range = np.linspace(np.min(x), np.max(x), 200)
ref_dist = getattr(stats, ref_dist_name)
# Try to get PDF
try:
if hasattr(ref_dist, "pdf"):
pdf_vals = ref_dist.pdf(x_range)
ax.plot(x_range, pdf_vals, label=f"{ref_dist_name} PDF")
except Exception: # noqa: E722
pass
ax.set_xlabel("Value")
ax.set_ylabel("Density")
ax.set_title("Histogram")
ax.legend()
def _plot_ks_1samp_simple(x, cdf_func, var_x, result, ref_dist_name, ax):
"""Create single CDF comparison plot on provided axes."""
from scitex_stats._plot_helpers import stats_text_box
# Compute ECDF
x_sorted = np.sort(x)
ecdf = np.arange(1, len(x) + 1) / len(x)
# Compute reference CDF
ref_cdf = cdf_func(x_sorted)
# Plot both CDFs
ax.step(
x_sorted,
ecdf,
where="post",
label=f"Empirical ({var_x})",
)
ax.plot(
x_sorted,
ref_cdf,
label=f"Reference ({ref_dist_name})",
)
# Mark maximum difference
diff = np.abs(ecdf - ref_cdf)
max_idx = np.argmax(diff)
ax.vlines(
x_sorted[max_idx],
ecdf[max_idx],
ref_cdf[max_idx],
linestyles="dashed",
label=f"D = {result['statistic']:.3f}",
)
ax.set_xlabel("Value")
ax.set_ylabel("Cumulative Probability")
ax.set_title("KS Test (1-sample)")
ax.legend()
# Add text with results
stats_text_box(
ax,
[
fmt_stat("D", result["statistic"]),
fmt_stat("p", result["pvalue"], fmt=".4f", stars=result["stars"]),
f"Matches: {result['matches_distribution']}",
f"{fmt_sym('n')} = {result['n']}",
],
)
"""Main function"""
def main(args):
"""Demonstrate one-sample Kolmogorov-Smirnov test functionality."""
logger.info("Demonstrating one-sample Kolmogorov-Smirnov test")
# Set random seed
np.random.seed(42)
# Example 1: One-sample test - normal data
logger.info("\n=== Example 1: One-sample KS test (normal data) ===")
x_normal = np.random.normal(0, 1, 100)
result1 = test_ks_1samp(
x_normal, cdf="norm", args=(0, 1), var_x="Normal data", verbose=True
)
# Example 2: One-sample test - exponential data tested against normal
logger.info("\n=== Example 2: One-sample KS test (exponential data vs normal) ===")
x_exp = np.random.exponential(2, 100)
result2 = test_ks_1samp(
x_exp,
cdf="norm",
args=(np.mean(x_exp), np.std(x_exp)),
var_x="Exponential data",
verbose=True,
)
# Example 3: One-sample test with visualization
logger.info("\n=== Example 3: One-sample KS test with visualization ===")
x_mixed = np.concatenate([np.random.normal(0, 1, 90), np.random.normal(3, 1, 10)])
result3 = test_ks_1samp(
x_mixed,
cdf="norm",
args=(0, 1),
var_x="Mixed data",
plot=True,
verbose=True,
)
plt.gcf().savefig("./ks_1samp_example.jpg")
plt.close("all")
# Example 4: Export results
logger.info("\n=== Example 4: Export results ===")
from scitex_stats._utils._normalizers import force_dataframe
test_results = [result1, result2, result3]
df = force_dataframe(test_results)
logger.info(f"\nDataFrame shape: {df.shape}")
df.to_excel("./ks_1samp_results.xlsx", index=False)
logger.info("Results exported to ./ks_1samp_results.xlsx")
return 0
def parse_args():
"""Parse command line arguments."""
parser = argparse.ArgumentParser(
description="Demonstrate one-sample Kolmogorov-Smirnov test"
)
parser.add_argument("--verbose", action="store_true", help="Enable verbose output")
return parser.parse_args()
def run_main():
"""Run main without the scitex umbrella session helpers."""
import matplotlib
matplotlib.use("Agg")
args = parse_args()
return main(args)
if __name__ == "__main__":
run_main()
# EOF