hcitools.analysis
This module contains functions and classes for performing statistical analysis and machine learning
"""
This module contains functions and classes for performing statistical analysis
and machine learning
"""

# Imports
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from umap import UMAP
from rich import print

import pandas as pd
import numpy as np
import re


# Seed forced onto every estimator built by `dim_reduction`
RANDOMSTATE = 69


def dim_reduction(
    data,
    method=['pca', 'tsne', 'umap'],
    pca_kws=None,
    tsne_kws=None,
    umap_kws=None
):
    """
    Perform dimensionality reduction on data

    Every requested method is fitted on `data` independently and the
    resulting projections are returned together in a single data frame.

    Parameters
    ----------
    `data` : pd.DataFrame
        Data frame of features. Should only contain numeric columns.
        Metadata can be stored in the index.
        NOTE(review): the final pivot groups on `data.index.names`, so
        unnamed index levels look unsupported - confirm with callers.
    `method` : str or list, optional
        Method(s) to use for dimensionality reduction,
        by default ['pca', 'tsne', 'umap']. Matching is case-insensitive
        and repeated entries are ignored.
    `{pca, tsne, umap}_kws` : dict, optional
        Arguments for the estimators, by default None. When None, the
        defaults are: PCA `n_components=5`; TSNE `n_components=3`,
        `perplexity=30.0`, `learning_rate='auto'`, `init='random'`; UMAP
        `n_components=3`, `init='random'`, `n_neighbors=20`,
        `min_dist=0.2`. `random_state` is always set to `RANDOMSTATE`,
        overriding any value supplied here. The dictionaries passed in
        are not modified.

    Returns
    -------
    pd.DataFrame
        Data frame of low-dimensional projections. It holds the index
        levels of `data`, a 'variable' column with the upper-cased method
        name, and one column per component number ('1', '2', ...).
        Components a method did not produce are NaN. Rows of `data` that
        share identical index values are averaged, because `pivot_table`
        aggregates with its default mean.
    np.array
        Only if 'pca' in method, explained variance of each principal
        component as a percentage; otherwise None

    Raises
    ------
    ValueError
        If `method` is empty or contains an unsupported name.
    """

    # The list default in the signature is only read, never mutated, so
    # it is safe to share between calls.
    if isinstance(method, str):
        method = [method]
    # Drop repeats (keeping first-seen order): fitting the same method
    # twice would only produce clashing column labels.
    method = list(dict.fromkeys(x.lower() for x in method))

    # Raise instead of assert so the check survives `python -O`
    if not method:
        raise ValueError("method must be 'pca', 'tsne' or 'umap'")
    for m in method:
        if m not in ('pca', 'tsne', 'umap'):
            raise ValueError("method must be 'pca', 'tsne' or 'umap'")

    if pca_kws is None:
        pca_kws = dict(n_components=5)
    if tsne_kws is None:
        tsne_kws = dict(n_components=3, perplexity=30.0,
                        learning_rate='auto', init='random')
    if umap_kws is None:
        umap_kws = dict(n_components=3, init='random', n_neighbors=20,
                        min_dist=0.2)

    # Build new dicts so the caller's keyword dictionaries stay untouched
    # while the seed is still forced.
    pca_kws = {**pca_kws, 'random_state': RANDOMSTATE}
    tsne_kws = {**tsne_kws, 'random_state': RANDOMSTATE}
    umap_kws = {**umap_kws, 'random_state': RANDOMSTATE}

    # Fit each estimator and collect its projection
    blocks = []
    cols = []
    expvar = None
    for m in method:
        if m == 'pca':
            est = PCA(**pca_kws).fit(data)
            block = est.transform(data)
            expvar = est.explained_variance_ratio_ * 100
        elif m == 'tsne':
            block = TSNE(**tsne_kws).fit_transform(data)
        else:
            block = UMAP(**umap_kws).fit_transform(data)
        blocks.append(block)
        # Label from the actual projection width: `n_components` may be
        # None, a float or 'mle' for PCA, so it cannot be used for counting.
        cols.extend(f"{m.upper()} {i}" for i in range(1, block.shape[1] + 1))
    proj = np.concatenate(blocks, axis=1)

    # Create data frame of projections: melt to long form, split the
    # "<METHOD> <k>" labels into method and component number, then pivot
    # so each component number becomes a column.
    proj = (pd.DataFrame(proj, columns=cols, index=data.index)
        .melt(ignore_index=False))
    proj['component'] = (proj['variable']
        .apply(lambda x: re.search(r'\d+', x)[0])
        .astype(int))
    proj['variable'] = (proj['variable']
        .apply(lambda x: re.search(r'^\w*', x)[0]))
    proj = (proj
        .pivot_table(values='value', columns='component',
                     index=list(data.index.names) + ['variable'])
        .reset_index())
    proj.columns = proj.columns.astype(str)

    return proj, expvar
def dim_reduction(data, method=['pca', 'tsne', 'umap'], pca_kws=None, tsne_kws=None, umap_kws=None):
def dim_reduction(
    data,
    method=['pca', 'tsne', 'umap'],
    pca_kws=None,
    tsne_kws=None,
    umap_kws=None
):
    """
    Perform dimensionality reduction on data

    Every requested method is fitted on `data` independently and the
    resulting projections are returned together in a single data frame.

    Parameters
    ----------
    `data` : pd.DataFrame
        Data frame of features. Should only contain numeric columns.
        Metadata can be stored in the index.
        NOTE(review): the final pivot groups on `data.index.names`, so
        unnamed index levels look unsupported - confirm with callers.
    `method` : str or list, optional
        Method(s) to use for dimensionality reduction,
        by default ['pca', 'tsne', 'umap']. Matching is case-insensitive
        and repeated entries are ignored.
    `{pca, tsne, umap}_kws` : dict, optional
        Arguments for the estimators, by default None. When None, the
        defaults are: PCA `n_components=5`; TSNE `n_components=3`,
        `perplexity=30.0`, `learning_rate='auto'`, `init='random'`; UMAP
        `n_components=3`, `init='random'`, `n_neighbors=20`,
        `min_dist=0.2`. `random_state` is always set to `RANDOMSTATE`,
        overriding any value supplied here. The dictionaries passed in
        are not modified.

    Returns
    -------
    pd.DataFrame
        Data frame of low-dimensional projections. It holds the index
        levels of `data`, a 'variable' column with the upper-cased method
        name, and one column per component number ('1', '2', ...).
        Components a method did not produce are NaN. Rows of `data` that
        share identical index values are averaged, because `pivot_table`
        aggregates with its default mean.
    np.array
        Only if 'pca' in method, explained variance of each principal
        component as a percentage; otherwise None

    Raises
    ------
    ValueError
        If `method` is empty or contains an unsupported name.
    """

    # The list default in the signature is only read, never mutated, so
    # it is safe to share between calls.
    if isinstance(method, str):
        method = [method]
    # Drop repeats (keeping first-seen order): fitting the same method
    # twice would only produce clashing column labels.
    method = list(dict.fromkeys(x.lower() for x in method))

    # Raise instead of assert so the check survives `python -O`
    if not method:
        raise ValueError("method must be 'pca', 'tsne' or 'umap'")
    for m in method:
        if m not in ('pca', 'tsne', 'umap'):
            raise ValueError("method must be 'pca', 'tsne' or 'umap'")

    if pca_kws is None:
        pca_kws = dict(n_components=5)
    if tsne_kws is None:
        tsne_kws = dict(n_components=3, perplexity=30.0,
                        learning_rate='auto', init='random')
    if umap_kws is None:
        umap_kws = dict(n_components=3, init='random', n_neighbors=20,
                        min_dist=0.2)

    # Build new dicts so the caller's keyword dictionaries stay untouched
    # while the seed is still forced.
    pca_kws = {**pca_kws, 'random_state': RANDOMSTATE}
    tsne_kws = {**tsne_kws, 'random_state': RANDOMSTATE}
    umap_kws = {**umap_kws, 'random_state': RANDOMSTATE}

    # Fit each estimator and collect its projection
    blocks = []
    cols = []
    expvar = None
    for m in method:
        if m == 'pca':
            est = PCA(**pca_kws).fit(data)
            block = est.transform(data)
            expvar = est.explained_variance_ratio_ * 100
        elif m == 'tsne':
            block = TSNE(**tsne_kws).fit_transform(data)
        else:
            block = UMAP(**umap_kws).fit_transform(data)
        blocks.append(block)
        # Label from the actual projection width: `n_components` may be
        # None, a float or 'mle' for PCA, so it cannot be used for counting.
        cols.extend(f"{m.upper()} {i}" for i in range(1, block.shape[1] + 1))
    proj = np.concatenate(blocks, axis=1)

    # Create data frame of projections: melt to long form, split the
    # "<METHOD> <k>" labels into method and component number, then pivot
    # so each component number becomes a column.
    proj = (pd.DataFrame(proj, columns=cols, index=data.index)
        .melt(ignore_index=False))
    proj['component'] = (proj['variable']
        .apply(lambda x: re.search(r'\d+', x)[0])
        .astype(int))
    proj['variable'] = (proj['variable']
        .apply(lambda x: re.search(r'^\w*', x)[0]))
    proj = (proj
        .pivot_table(values='value', columns='component',
                     index=list(data.index.names) + ['variable'])
        .reset_index())
    proj.columns = proj.columns.astype(str)

    return proj, expvar
Perform dimensionality reduction on data
Parameters
- data (pd.DataFrame): Data frame of features. Should only contain numeric columns. Metadata can be stored in the index.
- method (str or list, optional): Method(s) to use for dimensionality reduction, by default ['pca', 'tsne', 'umap'].
- {pca, tsne, umap}_kws (dict, optional): Arguments for the estimators, by default None.
Returns
- pd.DataFrame: Data frame of low-dimensional projections
- np.array: Explained variance of each principal component, as a percentage. Only computed if 'pca' is in method; otherwise None.