hcitools.preprocess
This module contains functions for data preprocessing
1""" 2This module contains functions for data preprocessing 3""" 4 5# Imports 6from sklearn.preprocessing import StandardScaler, MinMaxScaler 7from sklearn.feature_selection import VarianceThreshold 8from rich import print 9 10import pandas as pd 11import numpy as np 12import re 13 14 15def drop_high_corr(data, thresh=0.95, method='pearson'): 16 """ 17 Remove features with correaltions above a threshold from a data frame. 18 19 Parameters 20 ---------- 21 `data` : pd.DataFrame 22 Original data frame 23 `thresh` : float, optional 24 Correlation threshold, by default 0.95 25 `method` : str, optional 26 Either 'pearson' or 'spearman', by default 'pearson' 27 28 Returns 29 ------- 30 pd.DataFrame 31 Data frame without highly correlated features 32 dict 33 Keys = features still in data frame 34 Values = list of highly correlated features 35 """ 36 37 assert 0 < thresh <= 1, "thresh must be between 0 and 1" 38 assert method in ['pearson', 'spearman'], \ 39 "Only 'pearson' or 'spearman' allowed" 40 41 # Compute correlations 42 corr = data.corr(method).abs() 43 corr = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool)) 44 45 # Create dictionary of features to drop 46 dropped = dict() 47 for col in corr.columns: 48 I = corr[col] > thresh 49 if any(I): 50 dropped[col] = corr.columns[I].tolist() 51 52 dropping = [x for sub in dropped.values() for x in sub if x not in dropped] 53 54 return data.drop(dropping, axis=1), dropped 55 56 57def drop_low_variance(data, thresh=0.0, na_replacement=-999): 58 """ 59 Remove low-variance features from a data frame 60 61 Parameters 62 ---------- 63 `data` : pd.DataFrame 64 Original data frame 65 `thresh` : float, optional 66 Variance threshold, by default 0.0 67 `na_replacement` : int, optional 68 Replacement value for NAs, by default -999 69 70 Returns 71 ------- 72 pd.DataFrame 73 Data frame without low-variance features 74 """ 75 76 df = data.copy() 77 selector = VarianceThreshold(thresh) 78 selector.fit(df.fillna(na_replacement)) 
79 80 return df.loc[:, selector.get_support(indices=False)] 81 82 83def _printif(cond, *args, **kwargs): 84 """ 85 Print if cond is true 86 """ 87 88 if cond: print(*args, **kwargs) 89 90 91def _intersperse(array, item): 92 """ 93 Insert item between each item in an array 94 """ 95 96 result = [item] * (len(array) * 2 - 1) 97 result[0::2] = array 98 return result 99 100 101def clean_data( 102 data, 103 metacols, 104 dropna=False, 105 drop_low_var=None, 106 corr_thresh=None, 107 corr_method='pearson', 108 intens_norm=False, 109 intens_rgx=r'Intensity', 110 num_objs='number of objects', 111 verbose=False 112): 113 """ 114 Perform preprocessing steps on a high-content imaging data 115 116 Parameters 117 ---------- 118 `data` : pd.DataFrame 119 Original data frame 120 `metacols` : list 121 List of non-numeric columns in data frame 122 `dropna` : bool, optional 123 Drop NA-only columns and any rows with NAs, by default False 124 `drop_low_var` : float, optional 125 Threshold for dropping low variance features, by default None 126 `corr_thresh` : float, optional 127 Threshold for dropping highly correlated features, by default None 128 `corr_method` : str, optional 129 Correlation method, by default 'spearman' 130 `intens_norm` : bool, optional 131 Should intensity-based features be normalized, by default False 132 `intens_rgx` : str, optional 133 Regular expression for identifying intensity based features, 134 by default `r'Intensity'` 135 `num_objs` : str, optional 136 Feature definining object counts, by default 'number of objects' 137 `verbose` : bool, optional 138 Should a log of processing steps be returned, by default False 139 140 Returns 141 ------- 142 pd.DataFrame 143 Preprocessed data 144 list 145 Only if `verbose == True`, preprocessing log 146 """ 147 148 for col in metacols: 149 assert col in data.columns, "Items in metacols must be columns of data" 150 if intens_norm: 151 assert num_objs in data.columns, "num_objs must be aa column of data" 152 153 # 
Create log 154 if verbose: 155 LOG = [f"Original data shape: {data.shape}"] 156 else: 157 LOG = None 158 159 # Store non-numeric data in index 160 data = data.set_index(metacols) 161 162 # Track dropped features 163 og_features = set(data.columns) 164 dropped = dict() 165 166 if dropna: 167 data = (data 168 .dropna(axis=1, how='all') # NA-only columns 169 .dropna(axis=0, how='any')) # Rows with NAs 170 dropped['dropna'] = list(og_features.difference(data.columns)) 171 172 if verbose: 173 LOG.append( 174 f"After removing missing data, data shape: {data.shape}" 175 ) 176 177 if drop_low_var is not None: 178 data = drop_low_variance(data, thresh=drop_low_var) 179 dropped['low_var'] = list(og_features.difference(data.columns)) 180 181 if verbose: 182 LOG.append( 183 f"After removing low-variance features, data shape: {data.shape}" 184 ) 185 186 if corr_thresh is not None: 187 data, dropped['high_corr'] = drop_high_corr(data, corr_thresh, corr_method) 188 189 if verbose: 190 LOG.append( 191 f"After removing highly correlated features, data.shape: {data.shape}" 192 ) 193 194 if intens_norm: 195 intens_features = [x for x in data.columns if re.search(intens_rgx, x)] 196 data[intens_features] = (data[intens_features] 197 .div(data[num_objs], axis=0)) 198 199 if verbose: 200 LOG.append( 201 f"Intensity-based features were normalized by '{num_objs}'" 202 ) 203 204 # Store additional metadata 205 data = data.reset_index() 206 data.attrs['metacols'] = metacols 207 data.attrs['features'] = list(set(data.columns).difference(metacols)) 208 209 return data, dropped, LOG 210 211 212def normalize(df, method='minmax'): 213 """ 214 Normalize a data frame 215 216 Parameters 217 ---------- 218 df : pd.DataFrame 219 Original data frame 220 method : str, optional 221 Either 'minmax' or 'z', by default 'minmax' 222 223 Returns 224 ------- 225 pd.DataFrame 226 Normalized data frame 227 228 Raises 229 ------ 230 NotImplementedError 231 If method isn't 'minmax' or 'z' 232 """ 233 234 if 
method == 'minmax': 235 X = MinMaxScaler().fit_transform(df.values) 236 elif method == 'z': 237 X = StandardScaler().fit_transform(df.values) 238 else: 239 raise NotImplementedError("Can't do that yet.") 240 241 return pd.DataFrame(X, columns=df.columns, index=df.index)
def drop_high_corr(data, thresh=0.95, method='pearson'):
    """
    Remove features with correlations above a threshold from a data frame.

    Parameters
    ----------
    `data` : pd.DataFrame
        Original data frame
    `thresh` : float, optional
        Correlation threshold, by default 0.95
    `method` : str, optional
        Either 'pearson' or 'spearman', by default 'pearson'

    Returns
    -------
    pd.DataFrame
        Data frame without highly correlated features
    dict
        Keys = features still in data frame
        Values = list of highly correlated features
    """

    assert 0 < thresh <= 1, "thresh must be between 0 and 1"
    assert method in ['pearson', 'spearman'], \
        "Only 'pearson' or 'spearman' allowed"

    # Absolute correlations; keep the upper triangle only (k=1 masks the
    # diagonal) so each feature pair is considered exactly once
    corr = data.corr(method=method).abs()
    corr = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))

    # Map each feature to the list of features highly correlated with it
    dropped = dict()
    for col in corr.columns:
        mask = corr[col] > thresh
        if mask.any():
            dropped[col] = corr.columns[mask].tolist()

    # Drop correlated partners, but never a feature that is itself a key
    dropping = [x for sub in dropped.values() for x in sub if x not in dropped]

    return data.drop(dropping, axis=1), dropped
Remove features with correlations above a threshold from a data frame.

Parameters
- data (pd.DataFrame): Original data frame
- thresh (float, optional): Correlation threshold, by default 0.95
- method (str, optional): Either 'pearson' or 'spearman', by default 'pearson'

Returns
- pd.DataFrame: Data frame without highly correlated features
- dict: Keys = features still in data frame; Values = list of highly correlated features
def drop_low_variance(data, thresh=0.0, na_replacement=-999):
    """
    Remove low-variance features from a data frame.

    Parameters
    ----------
    `data` : pd.DataFrame
        Original data frame
    `thresh` : float, optional
        Variance threshold, by default 0.0
    `na_replacement` : int, optional
        Replacement value for NAs, by default -999

    Returns
    -------
    pd.DataFrame
        Data frame without low-variance features
    """

    df = data.copy()
    selector = VarianceThreshold(thresh)
    # NAs are imputed only for the variance computation; the returned
    # frame keeps its original (possibly missing) values
    selector.fit(df.fillna(na_replacement))

    return df.loc[:, selector.get_support(indices=False)]
Remove low-variance features from a data frame.

Parameters
- data (pd.DataFrame): Original data frame
- thresh (float, optional): Variance threshold, by default 0.0
- na_replacement (int, optional): Replacement value for NAs, by default -999

Returns
- pd.DataFrame: Data frame without low-variance features
def clean_data(
    data,
    metacols,
    dropna=False,
    drop_low_var=None,
    corr_thresh=None,
    corr_method='pearson',
    intens_norm=False,
    intens_rgx=r'Intensity',
    num_objs='number of objects',
    verbose=False
):
    """
    Perform preprocessing steps on high-content imaging data.

    Parameters
    ----------
    `data` : pd.DataFrame
        Original data frame
    `metacols` : list
        List of non-numeric columns in data frame
    `dropna` : bool, optional
        Drop NA-only columns and any rows with NAs, by default False
    `drop_low_var` : float, optional
        Threshold for dropping low variance features, by default None
    `corr_thresh` : float, optional
        Threshold for dropping highly correlated features, by default None
    `corr_method` : str, optional
        Correlation method, by default 'pearson'
    `intens_norm` : bool, optional
        Should intensity-based features be normalized, by default False
    `intens_rgx` : str, optional
        Regular expression for identifying intensity based features,
        by default `r'Intensity'`
    `num_objs` : str, optional
        Feature defining object counts, by default 'number of objects'
    `verbose` : bool, optional
        Should a log of processing steps be returned, by default False

    Returns
    -------
    pd.DataFrame
        Preprocessed data
    dict
        Features dropped at each preprocessing step
    list or None
        Preprocessing log; None unless `verbose == True`
    """

    for col in metacols:
        assert col in data.columns, "Items in metacols must be columns of data"
    if intens_norm:
        assert num_objs in data.columns, "num_objs must be a column of data"

    # Create log
    LOG = [f"Original data shape: {data.shape}"] if verbose else None

    # Store non-numeric data in index
    data = data.set_index(metacols)

    # Track features dropped at each step
    dropped = dict()

    if dropna:
        before = set(data.columns)
        data = (data
                .dropna(axis=1, how='all')   # NA-only columns
                .dropna(axis=0, how='any'))  # Rows with NAs
        dropped['dropna'] = list(before.difference(data.columns))

        if verbose:
            LOG.append(
                f"After removing missing data, data shape: {data.shape}"
            )

    if drop_low_var is not None:
        # BUG FIX: snapshot columns before *this* step so the 'low_var'
        # list does not also include columns already removed by dropna
        before = set(data.columns)
        data = drop_low_variance(data, thresh=drop_low_var)
        dropped['low_var'] = list(before.difference(data.columns))

        if verbose:
            LOG.append(
                f"After removing low-variance features, data shape: {data.shape}"
            )

    if corr_thresh is not None:
        data, dropped['high_corr'] = drop_high_corr(data, corr_thresh, corr_method)

        if verbose:
            LOG.append(
                f"After removing highly correlated features, data.shape: {data.shape}"
            )

    if intens_norm:
        # Earlier steps may have dropped the object-count column; fail with
        # an explicit message instead of a bare pandas KeyError
        if num_objs not in data.columns:
            raise KeyError(
                f"'{num_objs}' was removed by an earlier preprocessing step; "
                "cannot normalize intensity features"
            )
        intens_features = [x for x in data.columns if re.search(intens_rgx, x)]
        data[intens_features] = (data[intens_features]
                                 .div(data[num_objs], axis=0))

        if verbose:
            LOG.append(
                f"Intensity-based features were normalized by '{num_objs}'"
            )

    # Store additional metadata
    data = data.reset_index()
    data.attrs['metacols'] = metacols
    data.attrs['features'] = list(set(data.columns).difference(metacols))

    return data, dropped, LOG
Perform preprocessing steps on high-content imaging data.

Parameters
- data (pd.DataFrame): Original data frame
- metacols (list): List of non-numeric columns in data frame
- dropna (bool, optional): Drop NA-only columns and any rows with NAs, by default False
- drop_low_var (float, optional): Threshold for dropping low variance features, by default None
- corr_thresh (float, optional): Threshold for dropping highly correlated features, by default None
- corr_method (str, optional): Correlation method, by default 'pearson'
- intens_norm (bool, optional): Should intensity-based features be normalized, by default False
- intens_rgx (str, optional): Regular expression for identifying intensity-based features, by default r'Intensity'
- num_objs (str, optional): Feature defining object counts, by default 'number of objects'
- verbose (bool, optional): Should a log of processing steps be returned, by default False

Returns
- pd.DataFrame: Preprocessed data
- list: Only if verbose == True, preprocessing log
def normalize(df, method='minmax'):
    """
    Normalize a data frame.

    Parameters
    ----------
    df : pd.DataFrame
        Original data frame
    method : str, optional
        Either 'minmax' or 'z', by default 'minmax'

    Returns
    -------
    pd.DataFrame
        Normalized data frame

    Raises
    ------
    NotImplementedError
        If method isn't 'minmax' or 'z'
    """

    if method == 'minmax':
        X = MinMaxScaler().fit_transform(df.values)
    elif method == 'z':
        X = StandardScaler().fit_transform(df.values)
    else:
        raise NotImplementedError("Can't do that yet.")

    # Re-attach the original labels to the scaled ndarray
    return pd.DataFrame(X, columns=df.columns, index=df.index)
Normalize a data frame
Parameters
- df (pd.DataFrame): Original data frame
- method (str, optional): Either 'minmax' or 'z', by default 'minmax'
Returns
- pd.DataFrame: Normalized data frame
Raises
- NotImplementedError: If method isn't 'minmax' or 'z'