hcitools.analysis

This module contains functions and classes for performing statistical analysis and machine learning.

  1"""
  2This module contains functions and classes for performing statistical analysis 
  3and machine learning
  4"""
  5
  6# Imports
  7from sklearn.decomposition import PCA
  8from sklearn.manifold import TSNE
  9from umap import UMAP
 10from rich import print
 11
 12import pandas as pd
 13import numpy as np
 14import re
 15
 16
 # Fixed seed: dim_reduction passes it as `random_state` to the PCA, t-SNE and UMAP estimators
 17RANDOMSTATE = 69
 18
 19
 20def dim_reduction(
 21    data,
 22    method=['pca', 'tsne', 'umap'],
 23    pca_kws=None,
 24    tsne_kws=None,
 25    umap_kws=None
 26):
 27    """
 28    Perform dimensionality reduction on data 
 29
 30    Parameters
 31    ----------
 32    `data` : pd.DataFrame
 33        Data frame of features. Should only contain numeric columns.
 34        Metadata can be stored in the index
 35    `method` : str or list, optional
 36        Method(s) to use for dimensionality reduction, 
 37        by default ['pca', 'tsne', 'umap']
 38    `{pca, tsne, umap}_kws` : dict, optional
 39        Arguments for the estimators, by default None
 40
 41    Returns
 42    -------
 43    pd.DataFrame
 44        Data frame of low-dimensional projections
 45    np.array
 46        Only if 'pca' in method, list of explained variances
 47    """
 48
 49    if isinstance(method, str):
 50        method = [method]
 51    method = [x.lower() for x in method]
 52    for m in method:
 53        assert m in ['pca', 'tsne', 'umap'], "method must be 'pca', 'tsne' or 'umap'"
 54
 55    if pca_kws is None:
 56        pca_kws = dict(n_components=5, random_state=RANDOMSTATE)
 57    else:
 58        pca_kws['random_state'] = RANDOMSTATE
 59
 60    if tsne_kws is None:
 61        tsne_kws = dict(n_components=3, perplexity=30.0, learning_rate='auto', 
 62                        init='random', random_state=RANDOMSTATE)
 63    else:
 64        tsne_kws['random_state'] = RANDOMSTATE
 65
 66    if umap_kws is None:
 67        umap_kws = dict(n_components=3, init='random', n_neighbors=20, 
 68                        min_dist=0.2, random_state=RANDOMSTATE)
 69    else:
 70        umap_kws['random_state'] = RANDOMSTATE
 71
 72    # Initialize estimators
 73    estimators = dict()
 74    for m in method:
 75        if m == 'pca':
 76            estimators[m] = PCA(**pca_kws).fit(data)
 77        elif m == 'tsne':
 78            estimators[m] = TSNE(**tsne_kws)
 79        elif m == 'umap':
 80            estimators[m] = UMAP(**umap_kws)
 81        else:
 82            raise ValueError("How did you get here?")
 83
 84    # Compute projections
 85    proj = []
 86    expvar = None
 87    for m, est in estimators.items():
 88        if m == 'pca':
 89            proj.append( est.transform(data) )
 90            expvar = est.explained_variance_ratio_ * 100
 91        else:
 92            proj.append( est.fit_transform(data) )
 93    proj = np.concatenate(proj, axis=1)
 94
 95    # Create column names for output data frame
 96    cols = []
 97    for m in method:
 98        n = estimators[m].n_components + 1
 99        cols.append([f"{m.upper()} {i}" for i in range(1, n)])
100    cols = [x for sub in cols for x in sub]
101
102    # Create data frame of projections
103    proj = (pd.DataFrame(proj, columns=cols, index=data.index)
104        .melt(ignore_index=False))
105    proj['component'] = (proj['variable']
106        .apply(lambda x: re.search(r'\d+', x)[0])
107        .astype(int))
108    proj['variable'] = (proj['variable']
109        .apply(lambda x: re.search(r'^\w*', x)[0]))
110    proj = (proj
111        .pivot_table(values='value', columns='component',
112                     index=list(data.index.names) + ['variable'])
113        .reset_index())
114    proj.columns = proj.columns.astype(str)
115
116    return proj, expvar
def dim_reduction(data, method=['pca', 'tsne', 'umap'], pca_kws=None, tsne_kws=None, umap_kws=None):
 21def dim_reduction(
 22    data,
 23    method=['pca', 'tsne', 'umap'],
 24    pca_kws=None,
 25    tsne_kws=None,
 26    umap_kws=None
 27):
 28    """
 29    Perform dimensionality reduction on data 
 30
 31    Parameters
 32    ----------
 33    `data` : pd.DataFrame
 34        Data frame of features. Should only contain numeric columns.
 35        Metadata can be stored in the index
 36    `method` : str or list, optional
 37        Method(s) to use for dimensionality reduction, 
 38        by default ['pca', 'tsne', 'umap']
 39    `{pca, tsne, umap}_kws` : dict, optional
 40        Arguments for the estimators, by default None
 41
 42    Returns
 43    -------
 44    pd.DataFrame
 45        Data frame of low-dimensional projections
 46    np.array
 47        Only if 'pca' in method, list of explained variances
 48    """
 49
 50    if isinstance(method, str):
 51        method = [method]
 52    method = [x.lower() for x in method]
 53    for m in method:
 54        assert m in ['pca', 'tsne', 'umap'], "method must be 'pca', 'tsne' or 'umap'"
 55
 56    if pca_kws is None:
 57        pca_kws = dict(n_components=5, random_state=RANDOMSTATE)
 58    else:
 59        pca_kws['random_state'] = RANDOMSTATE
 60
 61    if tsne_kws is None:
 62        tsne_kws = dict(n_components=3, perplexity=30.0, learning_rate='auto', 
 63                        init='random', random_state=RANDOMSTATE)
 64    else:
 65        tsne_kws['random_state'] = RANDOMSTATE
 66
 67    if umap_kws is None:
 68        umap_kws = dict(n_components=3, init='random', n_neighbors=20, 
 69                        min_dist=0.2, random_state=RANDOMSTATE)
 70    else:
 71        umap_kws['random_state'] = RANDOMSTATE
 72
 73    # Initialize estimators
 74    estimators = dict()
 75    for m in method:
 76        if m == 'pca':
 77            estimators[m] = PCA(**pca_kws).fit(data)
 78        elif m == 'tsne':
 79            estimators[m] = TSNE(**tsne_kws)
 80        elif m == 'umap':
 81            estimators[m] = UMAP(**umap_kws)
 82        else:
 83            raise ValueError("How did you get here?")
 84
 85    # Compute projections
 86    proj = []
 87    expvar = None
 88    for m, est in estimators.items():
 89        if m == 'pca':
 90            proj.append( est.transform(data) )
 91            expvar = est.explained_variance_ratio_ * 100
 92        else:
 93            proj.append( est.fit_transform(data) )
 94    proj = np.concatenate(proj, axis=1)
 95
 96    # Create column names for output data frame
 97    cols = []
 98    for m in method:
 99        n = estimators[m].n_components + 1
100        cols.append([f"{m.upper()} {i}" for i in range(1, n)])
101    cols = [x for sub in cols for x in sub]
102
103    # Create data frame of projections
104    proj = (pd.DataFrame(proj, columns=cols, index=data.index)
105        .melt(ignore_index=False))
106    proj['component'] = (proj['variable']
107        .apply(lambda x: re.search(r'\d+', x)[0])
108        .astype(int))
109    proj['variable'] = (proj['variable']
110        .apply(lambda x: re.search(r'^\w*', x)[0]))
111    proj = (proj
112        .pivot_table(values='value', columns='component',
113                     index=list(data.index.names) + ['variable'])
114        .reset_index())
115    proj.columns = proj.columns.astype(str)
116
117    return proj, expvar

Perform dimensionality reduction on data

Parameters
  • data (pd.DataFrame): Data frame of features. Should only contain numeric columns. Metadata can be stored in the index
  • method (str or list, optional): Method(s) to use for dimensionality reduction, by default ['pca', 'tsne', 'umap']
  • {pca, tsne, umap}_kws (dict, optional): Arguments for the estimators, by default None
Returns
  • pd.DataFrame: Data frame of low-dimensional projections
  • np.ndarray or None: Percentage of variance explained by each PCA component if 'pca' is in method, otherwise None