Module edawesome.eda_help

Module with helper functions for EDA which are not just sns or pd wrappers.

Expand source code
"""Module with helper functions for EDA which are not just sns or pd wrappers."""

import pandas as pd
import numpy as np
from scipy.stats import ttest_ind, f_oneway

import matplotlib.pyplot as plt
import seaborn as sns

import pyspark

from .str_help import snake_to_title

def _print_statistical_test(test_results):
    """Print results of statistical test"""
    print(f'p-value: {test_results.pvalue}')
    if test_results.pvalue < 0.05:
        print('Reject null hypothesis!')
    else:
        print('Accept null hypothesis!')

def compare_distributions(df, num_col_name: str, cat_col_name: str, treat_nans: str = 'drop'):
    """Compare distributions of numeric feauture by categorical feature"""

    # TODO some decorator for checking state
    # TODO some decorator for checking possible values of the arguments and raising errors

    if treat_nans not in ['drop', 'mean', 'median']:
        raise ValueError(f'Unknown treatment method {treat_nans}! Use "drop", "mean" or "median"')

    sns.displot(
        data=df,
        x=num_col_name, hue=cat_col_name,
        kind='kde', fill=True
    )
    
    plt.title(f'{snake_to_title(num_col_name)} distribution by {snake_to_title(cat_col_name)}')

    sns.despine()
    plt.show()

    sns.boxplot(
        data=df,
        x=cat_col_name, y=num_col_name
    )

    # add mean labels
    means = df.groupby(cat_col_name)[num_col_name].mean()
    for i, mean in enumerate(means):
        plt.text(i, mean, round(mean, 2), horizontalalignment='center', size='large', color='w', weight='semibold')

    sns.despine()
    plt.show()

    if treat_nans == 'drop':
        df = df.dropna()
    elif treat_nans == 'mean':
        df = df.fillna(df.mean())
    elif treat_nans == 'median':
        df = df.fillna(df.median())

    num_col = df[num_col_name]
    cat_col = df[cat_col_name]


    if df[cat_col_name].nunique() > 2:

        raise NotImplementedError('Comparison of more than 2 categories is not implemented yet!')

        # ANOVA, comparing num_col distribution by cat_col
        print('Compare means with ANOVA:')
        test_results = f_oneway(*[num_col[cat_col == cat] for cat in cat_col.unique()])
        _print_statistical_test(test_results)
        
    elif df[cat_col_name].nunique() == 2:
        print('Compare means with t-test:')

        first = num_col[cat_col == cat_col.unique()[0]]
        second = num_col[cat_col == cat_col.unique()[1]]
        _print_statistical_test(ttest_ind(first, second))
    else:
        raise ValueError('Categorical feature has only one category!')

Functions

def compare_distributions(df, num_col_name: str, cat_col_name: str, treat_nans: str = 'drop')

Compare distributions of numeric feauture by categorical feature

Expand source code
def compare_distributions(df, num_col_name: str, cat_col_name: str, treat_nans: str = 'drop'):
    """Compare distributions of numeric feauture by categorical feature"""

    # TODO some decorator for checking state
    # TODO some decorator for checking possible values of the arguments and raising errors

    if treat_nans not in ['drop', 'mean', 'median']:
        raise ValueError(f'Unknown treatment method {treat_nans}! Use "drop", "mean" or "median"')

    sns.displot(
        data=df,
        x=num_col_name, hue=cat_col_name,
        kind='kde', fill=True
    )
    
    plt.title(f'{snake_to_title(num_col_name)} distribution by {snake_to_title(cat_col_name)}')

    sns.despine()
    plt.show()

    sns.boxplot(
        data=df,
        x=cat_col_name, y=num_col_name
    )

    # add mean labels
    means = df.groupby(cat_col_name)[num_col_name].mean()
    for i, mean in enumerate(means):
        plt.text(i, mean, round(mean, 2), horizontalalignment='center', size='large', color='w', weight='semibold')

    sns.despine()
    plt.show()

    if treat_nans == 'drop':
        df = df.dropna()
    elif treat_nans == 'mean':
        df = df.fillna(df.mean())
    elif treat_nans == 'median':
        df = df.fillna(df.median())

    num_col = df[num_col_name]
    cat_col = df[cat_col_name]


    if df[cat_col_name].nunique() > 2:

        raise NotImplementedError('Comparison of more than 2 categories is not implemented yet!')

        # ANOVA, comparing num_col distribution by cat_col
        print('Compare means with ANOVA:')
        test_results = f_oneway(*[num_col[cat_col == cat] for cat in cat_col.unique()])
        _print_statistical_test(test_results)
        
    elif df[cat_col_name].nunique() == 2:
        print('Compare means with t-test:')

        first = num_col[cat_col == cat_col.unique()[0]]
        second = num_col[cat_col == cat_col.unique()[1]]
        _print_statistical_test(ttest_ind(first, second))
    else:
        raise ValueError('Categorical feature has only one category!')