hcitools.process
Functions for loading and processing data.
1""" 2hcitools : process 3-------- ------- 4Functions for loading and processing data. 5""" 6 7# Imports 8from sklearn.preprocessing import StandardScaler, MinMaxScaler 9from sklearn.feature_selection import VarianceThreshold 10from sklearn.decomposition import PCA 11from sklearn.manifold import TSNE 12from umap import UMAP 13 14# from dash.html import P 15 16import pandas as pd 17import numpy as np 18import pathlib 19import base64 20import io 21import re 22 23# Paths to Data Files 24BASE_PATH = pathlib.Path(__file__).parent.resolve() 25ASSET_PATH = BASE_PATH.joinpath('assets/').resolve() 26 27# Define feature groups 28groups = { 29 'Nucleus': {'rgx': r'Nucleus|Number of Objects|HOECHST 33342', 30 'col': '#008AC9'}, 31 'TMRM': {'rgx': r'Alexa 594', 32 'col': '#FFC000'}, 33 'ROS': {'rgx': r'Alexa 488', 34 'col': '#70AD47'}, 35 'Mito Mass': {'rgx': r'Alexa 647|spots|MCTracker Deep Red', 36 'col': '#D75156'}, 37} 38 39 40## ------------------------------ DEFINITIONS ------------------------------- ## 41 42def drop_high_corr(data: pd.DataFrame, thresh: float=0.95): 43 """ 44 Remove features with high pearson correlations (> thresh) from a dataframe 45 """ 46 47 # Compute correlations 48 corr = data.corr('pearson').abs() 49 corr = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool)) 50 51 # Create a dictionary to keep track of features that exhibit high 52 # correlations 53 # keys = feature to keep 54 # values = features to drop 55 dropped = dict() 56 for col in corr.columns: 57 I = corr[col] > thresh 58 if any(I): 59 dropped[col] = corr.columns[I].tolist() 60 61 # List of features to drop 62 drop = [x for sub in dropped.values() for x in sub if x not in dropped] 63 64 return data.drop(drop, axis=1), drop 65 66 67def drop_low_variance(data: pd.DataFrame, thresh: float=0.0, 68 na_replacement: int=-999): 69 """ 70 Remove all low-variance features from a dataframe. 71 """ 72 73 df = data.copy() 74 selector = VarianceThreshold(thresh) 75 selector.fit(df.fillna(na_replacement)) 76 77 return df.loc[:, selector.get_support(indices=False)] 78 79 80def printif(cond: bool, *args, **kwargs): 81 # Print if cond is true 82 if cond: print(*args, **kwargs) 83 84 85def intersperse(lst, item): 86 # Insert item between each entry in lst 87 result = [item] * (len(lst) * 2 - 1) 88 result[0::2] = lst 89 return result 90 91 92# def clean_data(data, meta_cols, dropna=False, drop_low_var=False, 93# cor_thr=None, intens_norm=False, log=False, 94# num_objs = 'non-border cells - number of objects'): 95# """ 96# Clean a dataset and perform necessary preprocessing 97 98# Parameters 99# ---------- 100# data : pd.DataFrame 101# Dataframe 102# meta_cols : list 103# List of metadata columns 104# dropna : bool 105# Drop NA-only columns and any rows with NAs 106# drop_low_var : bool 107# Drop zero variance features 108# corr_thr : float 109# Correlation threshold. Remove features with correlations > corr_thr 110# intens_norm : bool 111# Should intensity-based features be normalized? 
112# log : bool 113# Return log of preprocessing steps 114 115# Returns 116# ------- 117# Data, Metadata and LOG 118# """ 119 120# # Create a 'Compound Concentration' column 121# data['cmpd_conc'] = (data['compound'].astype(str) + 122# ' (' + data['conc'].astype(str) + ')') 123# meta_cols.append('cmpd_conc') 124 125# # Sort data so that metadata columns are first 126# OTHERCOLS = [x for x in data.columns if x not in meta_cols] 127# data = data[meta_cols + OTHERCOLS].set_index(meta_cols) 128 129# # Initialize log 130# if log: 131# LOG = [P(f"Original data shape: {data.shape}")] 132# else: 133# LOG = None 134 135# # Keep track of any dropped features 136# original_features = set(data.columns).difference(meta_cols) 137# dropped = dict() 138 139# if dropna: 140# data = (data.dropna(axis=1, how='all') # NA-only columns 141# .dropna(axis=0, how='any')) # Rows with NAs 142# dropped['missing'] = list( original_features.difference(data.columns) ) 143# if log: 144# LOG += [P(f"After dropping NAs, data.shape: {data.shape}")] 145 146# if drop_low_var: 147# data = drop_low_variance(data, thresh=0.0) 148# dropped['low var'] = list( original_features.difference(data.columns) ) 149# if log: 150# LOG += [P(f"After removing zero-variance features, data shape: {data.shape}")] 151 152# # Normalize intensity based features by number of objects 153# # This assumes that the `number of objects` is in the column `num_objs` 154# if intens_norm: 155# intens_features = [x for x in data.columns if re.search('Intensity', x)] 156# data[intens_features] = (data[intens_features] 157# .div(data[num_objs], axis=0)) 158 159# if log: 160# LOG += [P(f"Intensity-based features were normalized by '{num_objs}'")] 161 162# if cor_thr is not None: 163# data, dropped['high corr'] = drop_high_corr(data, thresh=cor_thr) 164# if log: 165# LOG += [P(f"After removing highly correlated features, data shape: {data.shape}")] 166 167# # Keep track of which columns are metadata 168# data = data.reset_index() 169# data.attrs['meta_cols'] = meta_cols 170# data.attrs['features'] = list( set(data.columns).difference(meta_cols) ) 171# data.attrs['compounds'] = data['compound'].unique().tolist() 172 173# return data, dropped, LOG 174 175 176def normalize_df(df, method='MinMax'): 177 """ 178 Normalize a data frame 179 """ 180 181 if method == 'MinMax': 182 X = MinMaxScaler().fit_transform(df.values) 183 elif method == 'z': 184 X = StandardScaler().fit_transform(df.values) 185 else: 186 raise ValueError("method must be one of 'MinMax' or 'z'") 187 188 return pd.DataFrame(data=X, columns=df.columns, index=df.index) 189 190 191def dim_reduction(data, method=['PCA', 'tSNE', 'UMAP'], 192 pca_kws=None, tsne_kws=None, umap_kws=None) -> pd.DataFrame: 193 """ 194 Perfrom Dimensionality Reduction on data and Return the projections 195 196 Parameters 197 ---------- 198 data : pd.DataFrame 199 Data set. Should have any metadata columns in its index only. 200 method : str | list 201 What dimensionality reduction technique to use. 
202 Must be one or / or a combination of: 'PCA', 'tSNE', 'UMAP' 203 """ 204 205 data.columns = [x.lower() for x in data.columns] 206 207 if isinstance(method, str): 208 method = [method] 209 method = [x.lower() for x in method] 210 if np.all([x not in ['pca', 'tsne', 'umap'] for x in method]): 211 raise ValueError("method must be one of 'UMAP', 'tSNE', or 'PCA'") 212 213 if pca_kws is None: 214 pca_kws = dict(n_components=5, random_state=69) 215 else: 216 pca_kws['random_state'] = 69 217 218 if tsne_kws is None: 219 tsne_kws = dict(n_components=3, perplexity=30.0, learning_rate='auto', 220 init='random', random_state=69) 221 else: 222 tsne_kws['random_state'] = 69 223 224 if umap_kws is None: 225 umap_kws = dict(n_components=3, init='random', n_neighbors=20, 226 min_dist=0.2, random_state=69) 227 else: 228 umap_kws['random_state'] = 69 229 230 # Initiazte transformers 231 transformers = dict() 232 for m in method: 233 if m == 'pca': 234 transformers[m] = PCA(**pca_kws).fit(data) 235 elif m == 'tsne': 236 transformers[m] = TSNE(**tsne_kws) 237 elif m == 'umap': 238 transformers[m] = UMAP(**umap_kws) 239 else: 240 raise ValueError("You're not supposed to be here.") 241 242 # Compute projections 243 proj = [] 244 for m, tr in transformers.items(): 245 if m == 'pca': 246 proj.append( tr.transform(data) ) 247 else: 248 proj.append( tr.fit_transform(data) ) 249 proj = np.concatenate(proj, axis=1) 250 251 # Create column names for output data frame 252 cols = [] 253 for m in method: 254 if m == 'pca': 255 n = transformers['pca'].n_components + 1 256 cols.append([f'PCA {x}' for x in range(1,n)] ) 257 else: 258 cols.append([f'{m.upper()} {x}' for x in range(1,4)]) 259 cols = [x for sub in cols for x in sub] 260 261 # Prepare dataframe 262 proj = pd.DataFrame( 263 data=proj, 264 columns=cols, 265 index=data.index 266 ).melt(ignore_index=False) 267 proj['comp'] = proj.variable.apply(lambda x: re.search(r'\d+', x)[0]).astype(int) 268 proj['variable'] = proj.variable.apply(lambda x: re.search(r'^\w*', x)[0]) 269 proj = (proj 270 .pivot_table(values='value', columns='comp', 271 index=list(data.index.names) + ['variable']) 272 .reset_index()) 273 proj.columns = proj.columns.astype(str) 274 275 if 'pca' in method: 276 exvar = transformers['pca'].explained_variance_ratio_ * 100 277 return proj, exvar 278 279 return proj 280 281 282def assign_groups(features, groups=groups): 283 """ 284 Assign a list (array-like) of features to various groups 285 286 Parameters 287 ---------- 288 features : list or np.array 289 Array-like of features 290 groups : dict 291 Dictionary defining regular expressions and colors for each group. 
292 """ 293 294 # Check input 295 try: 296 features = np.asarray(features) 297 except: 298 raise ValueError('features must be array-like') 299 assert isinstance(groups, dict), "groups must be a dictionary" 300 301 # Assign features to groups 302 feature_groups = {f: ['Other'] for f in features} 303 for grp, prop in groups.items(): 304 r = np.vectorize(lambda x: bool(re.search(prop['rgx'], x))) 305 for f in features[r(features)]: 306 feature_groups[f] = [grp] 307 308 # Define colors for each group 309 group_colors = {k: v['col'] for k, v in groups.items()} 310 group_colors['Other'] = 'black' 311 312 return feature_groups, group_colors 313 314 315def parse_content(contents, filename): 316 """ 317 Parse content uploaded via a dcc.Upload component 318 """ 319 320 # Extract content 321 _, content_string = contents.split(',') 322 323 # Extract file extension 324 ext = pathlib.Path(filename).suffix 325 326 # Decode & parse data 327 decoded = base64.b64decode(content_string) 328 try: 329 if ext in ['.csv', '.tsv', '.txt']: 330 data = pd.read_csv( 331 io.StringIO(decoded.decode('utf-8')), 332 sep=None, engine='python' 333 ) 334 elif 'xls' in ext: 335 data = pd.read_excel(io.BytesIO(decoded)) 336 else: 337 raise ValueError('Unsupported file type') 338 except ValueError: 339 return {'error': ['bad-ftype']} 340 except Exception as e: 341 return {'error': ['loading-error', e.__repr__()]} 342 343 return data 344 345 346def search_opts(opts, ptn): 347 """ 348 Search a list of options for a value that matches a regex 349 """ 350 351 return np.any([ bool(re.search(ptn, x)) for x in opts ])
def drop_high_corr(data: pandas.core.frame.DataFrame, thresh: float = 0.95):
Remove features with high Pearson correlations (> thresh) from a dataframe. Returns the reduced dataframe together with the list of dropped features.
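A minimal usage sketch on synthetic data; the `feat_*` column names are made up for illustration. For a correlated pair, the column that appears later is kept as the representative and the earlier one ends up in the drop list.

```python
import numpy as np
import pandas as pd
from hcitools.process import drop_high_corr

rng = np.random.default_rng(0)
a = rng.normal(size=100)
df = pd.DataFrame({
    'feat_a': a,
    'feat_b': 2 * a + rng.normal(scale=0.01, size=100),  # nearly collinear with feat_a
    'feat_c': rng.normal(size=100),                       # independent
})

reduced, dropped = drop_high_corr(df, thresh=0.95)
print(dropped)                    # ['feat_a']  (feat_b is kept as the representative)
print(reduced.columns.tolist())   # ['feat_b', 'feat_c']
```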
def drop_low_variance(data: pandas.core.frame.DataFrame, thresh: float = 0.0, na_replacement: int = -999):
Remove all low-variance features from a dataframe.
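A short sketch with made-up columns: the constant column is removed at the default threshold of 0.0, and NAs are replaced with `na_replacement` only while the variance selector is fitted (the returned dataframe keeps its original values).

```python
import numpy as np
import pandas as pd
from hcitools.process import drop_low_variance

df = pd.DataFrame({
    'constant': [1.0, 1.0, 1.0, 1.0],     # zero variance -> dropped
    'varying':  [0.1, 0.4, np.nan, 0.9],  # kept; the NA is filled only for fitting
})

print(drop_low_variance(df).columns.tolist())   # ['varying']
```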
def printif(cond: bool, *args, **kwargs):
Print the given arguments only when cond is true.
def intersperse(lst, item):
Insert item between each entry of lst.
def normalize_df(df, method='MinMax'):
Normalize a data frame column-wise, using min-max scaling ('MinMax') or z-scoring ('z').
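A minimal sketch with a two-column toy frame; 'MinMax' rescales each column to [0, 1], while 'z' standardises each column to zero mean and unit variance via scikit-learn's scalers.

```python
import pandas as pd
from hcitools.process import normalize_df

df = pd.DataFrame({'x': [1.0, 2.0, 3.0], 'y': [10.0, 20.0, 40.0]})

print(normalize_df(df, method='MinMax'))
#      x         y
# 0  0.0  0.000000
# 1  0.5  0.333333
# 2  1.0  1.000000

print(normalize_df(df, method='z'))   # column-wise z-scores
```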
def dim_reduction(data, method=['PCA', 'tSNE', 'UMAP'], pca_kws=None, tsne_kws=None, umap_kws=None) -> pandas.core.frame.DataFrame:
Perform dimensionality reduction on data and return the projections. When 'PCA' is among the requested methods, the PCA explained-variance percentages are returned alongside the projections.
Parameters
- data (pd.DataFrame): Data set. Should have any metadata columns in its index only.
- method (str | list): Which dimensionality reduction technique(s) to use. Must be one of, or a combination of: 'PCA', 'tSNE', 'UMAP'.
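A usage sketch on synthetic data; the feature names and the `well` index are hypothetical. Metadata belongs in the index, and because the result is reshaped with `pivot_table`, the index should be unique (rows sharing an index label would be averaged). tSNE is skipped here only to keep the example quick; UMAP requires the `umap-learn` package.

```python
import numpy as np
import pandas as pd
from hcitools.process import dim_reduction

rng = np.random.default_rng(0)
n = 200
data = pd.DataFrame(
    rng.normal(size=(n, 10)),
    columns=[f'feature_{i}' for i in range(10)],
    index=pd.Index([f'well_{i:03d}' for i in range(n)], name='well'),
)

# PCA alone: returns the projections plus explained-variance percentages
proj, exvar = dim_reduction(data, method='PCA')
print(proj.head())      # columns: 'well', 'variable', '1' ... '5'
print(exvar.round(1))   # five percentages, one per principal component

# PCA and UMAP together; the 'variable' column distinguishes the projections
proj2, exvar2 = dim_reduction(data, method=['PCA', 'UMAP'])
print(proj2['variable'].unique())   # ['PCA' 'UMAP']
```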
def assign_groups(features, groups={'Nucleus': {'rgx': 'Nucleus|Number of Objects|HOECHST 33342', 'col': '#008AC9'}, 'TMRM': {'rgx': 'Alexa 594', 'col': '#FFC000'}, 'ROS': {'rgx': 'Alexa 488', 'col': '#70AD47'}, 'Mito Mass': {'rgx': 'Alexa 647|spots|MCTracker Deep Red', 'col': '#D75156'}}):
Assign a list (array-like) of features to the predefined feature groups. Returns a feature-to-group mapping and a group-to-color mapping.
Parameters
- features (list or np.array): Array-like of features
- groups (dict): Dictionary defining regular expressions and colors for each group.
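A short sketch using hypothetical feature names; any feature that matches none of the group regexes falls into 'Other'.

```python
from hcitools.process import assign_groups

features = [
    'Nucleus Area',                    # matches the 'Nucleus' pattern
    'Cell Intensity Alexa 594 Mean',   # matches the 'TMRM' pattern
    'Cell Roundness',                  # no match -> 'Other'
]

feature_groups, group_colors = assign_groups(features)
print(feature_groups)
# {'Nucleus Area': ['Nucleus'],
#  'Cell Intensity Alexa 594 Mean': ['TMRM'],
#  'Cell Roundness': ['Other']}
print(group_colors['Other'])   # 'black'
```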
def parse_content(contents, filename):
Parse content uploaded via a dcc.Upload component into a dataframe. On failure, an error payload of the form {'error': [...]} is returned instead.
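A self-contained sketch that fakes the `contents` string a `dcc.Upload` component would provide (a base64-encoded CSV with hypothetical columns):

```python
import base64
from hcitools.process import parse_content

csv_bytes = b"compound,conc,feature_1\nDMSO,0.1,1.23\nFCCP,1.0,4.56\n"
contents = 'data:text/csv;base64,' + base64.b64encode(csv_bytes).decode()

df = parse_content(contents, filename='upload.csv')
print(df)
#   compound  conc  feature_1
# 0     DMSO   0.1       1.23
# 1     FCCP   1.0       4.56

# Unsupported extensions come back as an error payload rather than a dataframe
print(parse_content(contents, filename='upload.pdf'))   # {'error': ['bad-ftype']}
```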
def search_opts(opts, ptn):
Search a list of options for a value that matches a regex; returns True if any option matches ptn.
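A one-line check that any option matches a pattern, handy for validating dropdown options (the example values are made up):

```python
from hcitools.process import search_opts

opts = ['compound', 'conc', 'plate_id']
print(search_opts(opts, r'conc'))    # True
print(search_opts(opts, r'^row$'))   # False
```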