hcitools.preprocess

This module contains functions for data preprocessing

  1"""
  2This module contains functions for data preprocessing
  3"""
  4
  5# Imports
  6from sklearn.preprocessing import StandardScaler, MinMaxScaler
  7from sklearn.feature_selection import VarianceThreshold
  8from rich import print
  9
 10import pandas as pd
 11import numpy as np
 12import re
 13
 14
 15def drop_high_corr(data, thresh=0.95, method='pearson'):
 16    """
 17    Remove features with correaltions above a threshold from a data frame.
 18
 19    Parameters
 20    ----------
 21    `data` : pd.DataFrame
 22        Original data frame
 23    `thresh` : float, optional
 24        Correlation threshold, by default 0.95
 25    `method` : str, optional
 26        Either 'pearson' or 'spearman', by default 'pearson'
 27
 28    Returns
 29    -------
 30    pd.DataFrame
 31        Data frame without highly correlated features
 32    dict
 33        Keys = features still in data frame
 34        Values = list of highly correlated features
 35    """
 36
 37    assert 0 < thresh <= 1, "thresh must be between 0 and 1"
 38    assert method in ['pearson', 'spearman'], \
 39        "Only 'pearson' or 'spearman' allowed"
 40
 41    # Compute correlations
 42    corr = data.corr(method).abs()
 43    corr = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
 44
 45    # Create dictionary of features to drop
 46    dropped = dict()
 47    for col in corr.columns:
 48        I = corr[col] > thresh
 49        if any(I):
 50            dropped[col] = corr.columns[I].tolist()
 51
 52    dropping = [x for sub in dropped.values() for x in sub if x not in dropped]
 53
 54    return data.drop(dropping, axis=1), dropped
 55
 56
 57def drop_low_variance(data, thresh=0.0, na_replacement=-999):
 58    """
 59    Remove low-variance features from a data frame
 60
 61    Parameters
 62    ----------
 63    `data` : pd.DataFrame
 64        Original data frame
 65    `thresh` : float, optional
 66        Variance threshold, by default 0.0
 67    `na_replacement` : int, optional
 68        Replacement value for NAs, by default -999
 69
 70    Returns
 71    -------
 72    pd.DataFrame
 73        Data frame without low-variance features
 74    """
 75
 76    df = data.copy()
 77    selector = VarianceThreshold(thresh)
 78    selector.fit(df.fillna(na_replacement))
 79
 80    return df.loc[:, selector.get_support(indices=False)]
 81
 82
 83def _printif(cond, *args, **kwargs):
 84    """
 85    Print if cond is true
 86    """
 87
 88    if cond: print(*args, **kwargs)
 89
 90
 91def _intersperse(array, item):
 92    """
 93    Insert item between each item in an array
 94    """
 95
 96    result = [item] * (len(array) * 2 - 1)
 97    result[0::2] = array
 98    return result
 99
100
def clean_data(
    data,
    metacols,
    dropna=False,
    drop_low_var=None,
    corr_thresh=None,
    corr_method='pearson',
    intens_norm=False,
    intens_rgx=r'Intensity',
    num_objs='number of objects',
    verbose=False
):
    """
    Perform preprocessing steps on high-content imaging data.

    Parameters
    ----------
    `data` : pd.DataFrame
        Original data frame
    `metacols` : list
        List of non-numeric columns in data frame
    `dropna` : bool, optional
        Drop NA-only columns and any rows with NAs, by default False
    `drop_low_var` : float, optional
        Threshold for dropping low variance features, by default None
    `corr_thresh` : float, optional
        Threshold for dropping highly correlated features, by default None
    `corr_method` : str, optional
        Correlation method, by default 'pearson'
    `intens_norm` : bool, optional
        Should intensity-based features be normalized, by default False
    `intens_rgx` : str, optional
        Regular expression for identifying intensity based features,
        by default `r'Intensity'`
    `num_objs` : str, optional
        Feature defining object counts, by default 'number of objects'
    `verbose` : bool, optional
        Should a log of processing steps be kept, by default False

    Returns
    -------
    pd.DataFrame
        Preprocessed data
    dict
        Features dropped at each step (keys = step names)
    list or None
        Preprocessing log if `verbose == True`, otherwise None
    """

    for col in metacols:
        assert col in data.columns, "Items in metacols must be columns of data"
    if intens_norm:
        assert num_objs in data.columns, "num_objs must be a column of data"

    # Create log
    LOG = [f"Original data shape: {data.shape}"] if verbose else None

    # Store non-numeric data in index
    data = data.set_index(metacols)

    # Track dropped features per preprocessing step
    dropped = dict()

    if dropna:
        before = set(data.columns)
        data = (data
            .dropna(axis=1, how='all')   # NA-only columns
            .dropna(axis=0, how='any'))  # Rows with NAs
        dropped['dropna'] = list(before.difference(data.columns))

        if verbose:
            LOG.append(
                f"After removing missing data, data shape: {data.shape}"
            )

    if drop_low_var is not None:
        # Diff against the columns present BEFORE this step (not the
        # original set) so features already removed by dropna are not
        # re-listed here.
        before = set(data.columns)
        data = drop_low_variance(data, thresh=drop_low_var)
        dropped['low_var'] = list(before.difference(data.columns))

        if verbose:
            LOG.append(
                f"After removing low-variance features, data shape: {data.shape}"
            )

    if corr_thresh is not None:
        data, dropped['high_corr'] = drop_high_corr(data, corr_thresh, corr_method)

        if verbose:
            LOG.append(
                f"After removing highly correlated features, data.shape: {data.shape}"
            )

    if intens_norm:
        # Per-object normalization: divide each intensity feature row-wise
        # by the object count column
        intens_features = [x for x in data.columns if re.search(intens_rgx, x)]
        data[intens_features] = (data[intens_features]
            .div(data[num_objs], axis=0))

        if verbose:
            LOG.append(
                f"Intensity-based features were normalized by '{num_objs}'"
            )

    # Store additional metadata
    data = data.reset_index()
    data.attrs['metacols'] = metacols
    data.attrs['features'] = list(set(data.columns).difference(metacols))

    return data, dropped, LOG
210
211
def normalize(df, method='minmax'):
    """
    Normalize a data frame.

    Parameters
    ----------
    df : pd.DataFrame
        Original data frame
    method : str, optional
        Either 'minmax' or 'z', by default 'minmax'

    Returns
    -------
    pd.DataFrame
        Normalized data frame

    Raises
    ------
    NotImplementedError
        If method isn't 'minmax' or 'z'
    """

    # Dispatch table mapping method name to the matching sklearn scaler
    scalers = {'minmax': MinMaxScaler, 'z': StandardScaler}
    if method not in scalers:
        raise NotImplementedError("Can't do that yet.")

    scaled = scalers[method]().fit_transform(df.values)
    return pd.DataFrame(scaled, columns=df.columns, index=df.index)
def drop_high_corr(data, thresh=0.95, method='pearson'):
16def drop_high_corr(data, thresh=0.95, method='pearson'):
17    """
18    Remove features with correlations above a threshold from a data frame.
19
20    Parameters
21    ----------
22    `data` : pd.DataFrame
23        Original data frame
24    `thresh` : float, optional
25        Correlation threshold, by default 0.95
26    `method` : str, optional
27        Either 'pearson' or 'spearman', by default 'pearson'
28
29    Returns
30    -------
31    pd.DataFrame
32        Data frame without highly correlated features
33    dict
34        Keys = features still in data frame
35        Values = list of highly correlated features
36    """
37
38    assert 0 < thresh <= 1, "thresh must be between 0 and 1"
39    assert method in ['pearson', 'spearman'], \
40        "Only 'pearson' or 'spearman' allowed"
41
42    # Compute correlations
43    corr = data.corr(method).abs()
44    corr = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
45
46    # Create dictionary of features to drop
47    dropped = dict()
48    for col in corr.columns:
49        I = corr[col] > thresh
50        if any(I):
51            dropped[col] = corr.columns[I].tolist()
52
53    dropping = [x for sub in dropped.values() for x in sub if x not in dropped]
54
55    return data.drop(dropping, axis=1), dropped

Remove features with correlations above a threshold from a data frame.

Parameters
  • data (pd.DataFrame): Original data frame
  • thresh (float, optional): Correlation threshold, by default 0.95
  • method (str, optional): Either 'pearson' or 'spearman', by default 'pearson'
Returns
  • pd.DataFrame: Data frame without highly correlated features
  • dict: Keys = features still in the data frame; values = list of highly correlated features
def drop_low_variance(data, thresh=0.0, na_replacement=-999):
58def drop_low_variance(data, thresh=0.0, na_replacement=-999):
59    """
60    Remove low-variance features from a data frame
61
62    Parameters
63    ----------
64    `data` : pd.DataFrame
65        Original data frame
66    `thresh` : float, optional
67        Variance threshold, by default 0.0
68    `na_replacement` : int, optional
69        Replacement value for NAs, by default -999
70
71    Returns
72    -------
73    pd.DataFrame
74        Data frame without low-variance features
75    """
76
77    df = data.copy()
78    selector = VarianceThreshold(thresh)
79    selector.fit(df.fillna(na_replacement))
80
81    return df.loc[:, selector.get_support(indices=False)]

Remove low-variance features from a data frame

Parameters
  • data (pd.DataFrame): Original data frame
  • thresh (float, optional): Variance threshold, by default 0.0
  • na_replacement (int, optional): Replacement value for NAs, by default -999
Returns
  • pd.DataFrame: Data frame without low-variance features
def clean_data( data, metacols, dropna=False, drop_low_var=None, corr_thresh=None, corr_method='pearson', intens_norm=False, intens_rgx='Intensity', num_objs='number of objects', verbose=False):
102def clean_data(
103    data, 
104    metacols, 
105    dropna=False, 
106    drop_low_var=None, 
107    corr_thresh=None,
108    corr_method='pearson',
109    intens_norm=False, 
110    intens_rgx=r'Intensity',
111    num_objs='number of objects', 
112    verbose=False
113):
114    """
115    Perform preprocessing steps on a high-content imaging data
116
117    Parameters
118    ----------
119    `data` : pd.DataFrame
120        Original data frame
121    `metacols` : list
122        List of non-numeric columns in data frame
123    `dropna` : bool, optional
124        Drop NA-only columns and any rows with NAs, by default False
125    `drop_low_var` : float, optional
126        Threshold for dropping low variance features, by default None
127    `corr_thresh` : float, optional
128        Threshold for dropping highly correlated features, by default None
129    `corr_method` : str, optional
130        Correlation method, by default 'pearson'
131    `intens_norm` : bool, optional
132        Should intensity-based features be normalized, by default False
133    `intens_rgx` : str, optional
134        Regular expression for identifying intensity based features, 
135        by default `r'Intensity'`
136    `num_objs` : str, optional
137        Feature defining object counts, by default 'number of objects'
138    `verbose` : bool, optional
139        Should a log of processing steps be returned, by default False
140    
141    Returns
142    -------
143    pd.DataFrame
144        Preprocessed data 
145    list
146        Only if `verbose == True`, preprocessing log
147    """
148
149    for col in metacols:
150        assert col in data.columns, "Items in metacols must be columns of data"
151    if intens_norm:
152        assert num_objs in data.columns, "num_objs must be a column of data"
153
154    # Create log
155    if verbose:
156        LOG = [f"Original data shape: {data.shape}"]
157    else:
158        LOG = None
159
160    # Store non-numeric data in index
161    data = data.set_index(metacols)
162
163    # Track dropped features
164    og_features = set(data.columns)
165    dropped = dict()
166
167    if dropna:
168        data = (data
169            .dropna(axis=1, how='all')   # NA-only columns
170            .dropna(axis=0, how='any'))  # Rows with NAs
171        dropped['dropna'] = list(og_features.difference(data.columns))
172
173        if verbose:
174            LOG.append(
175                f"After removing missing data, data shape: {data.shape}"
176            )
177
178    if drop_low_var is not None:
179        data = drop_low_variance(data, thresh=drop_low_var)
180        dropped['low_var'] = list(og_features.difference(data.columns))
181
182        if verbose:
183            LOG.append(
184                f"After removing low-variance features, data shape: {data.shape}"
185            )
186
187    if corr_thresh is not None:
188        data, dropped['high_corr'] = drop_high_corr(data, corr_thresh, corr_method)
189
190        if verbose:
191            LOG.append(
192                f"After removing highly correlated features, data.shape: {data.shape}"
193            )
194
195    if intens_norm:
196        intens_features = [x for x in data.columns if re.search(intens_rgx, x)]
197        data[intens_features] = (data[intens_features]
198            .div(data[num_objs], axis=0))
199
200        if verbose:
201            LOG.append(
202                f"Intensity-based features were normalized by '{num_objs}'"
203            )
204
205    # Store additional metadata
206    data = data.reset_index()
207    data.attrs['metacols'] = metacols
208    data.attrs['features'] = list(set(data.columns).difference(metacols))
209    
210    return data, dropped, LOG

Perform preprocessing steps on a high-content imaging data

Parameters
  • data (pd.DataFrame): Original data frame
  • metacols (list): List of non-numeric columns in data frame
  • dropna (bool, optional): Drop NA-only columns and any rows with NAs, by default False
  • drop_low_var (float, optional): Threshold for dropping low variance features, by default None
  • corr_thresh (float, optional): Threshold for dropping highly correlated features, by default None
  • corr_method (str, optional): Correlation method, by default 'pearson'
  • intens_norm (bool, optional): Should intensity-based features be normalized, by default False
  • intens_rgx (str, optional): Regular expression for identifying intensity based features, by default r'Intensity'
  • num_objs (str, optional): Feature defining object counts, by default 'number of objects'
  • verbose (bool, optional): Should a log of processing steps be returned, by default False
Returns
  • pd.DataFrame: Preprocessed data
  • dict: Features dropped at each preprocessing step
  • list: Preprocessing log if verbose == True, otherwise None
def normalize(df, method='minmax'):
213def normalize(df, method='minmax'):
214    """
215    Normalize a data frame
216
217    Parameters
218    ----------
219    df : pd.DataFrame
220        Original data frame
221    method : str, optional
222        Either 'minmax' or 'z', by default 'minmax'
223
224    Returns
225    -------
226    pd.DataFrame
227        Normalized data frame
228
229    Raises
230    ------
231    NotImplementedError
232        If method isn't 'minmax' or 'z'
233    """
234
235    if method == 'minmax':
236        X = MinMaxScaler().fit_transform(df.values)
237    elif method == 'z':
238        X = StandardScaler().fit_transform(df.values)
239    else:
240        raise NotImplementedError("Can't do that yet.")
241
242    return pd.DataFrame(X, columns=df.columns, index=df.index)

Normalize a data frame

Parameters
  • df (pd.DataFrame): Original data frame
  • method (str, optional): Either 'minmax' or 'z', by default 'minmax'
Returns
  • pd.DataFrame: Normalized data frame
Raises
  • NotImplementedError: If method isn't 'minmax' or 'z'