Calculate Highly Variable Features And Get mC Fraction AnnData

Purpose

The purpose of this step is to select highly variable features (HVF) and generate a cell-by-feature methylation fraction matrix for clustering. Highly variable features are selected by comparing each feature's normalized dispersion across cells.

Input

  • Filtered cell metadata;

  • MCDS files;

  • Feature list from basic feature filtering

Output

  • cell-by-HVF methylation fraction matrix stored in AnnData format, e.g., mCH adata and mCG adata.

Import

import pandas as pd
import xarray as xr
import dask
import ALLCools
from ALLCools import MCDS
from ALLCools.clustering import feature_enrichment

Parameters

# If True, will load all data into memory.
# Computation will be much faster, but also very memory intensive, only use this for small number of cells (<10,000)
# NOTE: the loading step later in this notebook is additionally skipped when the
# (possibly downsampled) dataset holds more than 20,000 cells.
load = True

# change this to the path to your filtered metadata
# (read as CSV with the first column used as the cell index)
metadata_path = 'CellMetadata.PassQC.csv.gz'

# change this to the paths to your MCDS files
# (all of them are passed together to MCDS.open)
mcds_path_list = [
    '../../../data/Brain/3C-171206.mcds',
    '../../../data/Brain/3C-171207.mcds',
    '../../../data/Brain/9H-190212.mcds',
    '../../../data/Brain/9H-190219.mcds',
]

# Feature list after basic filter
# (read without a header row; the first column holds the feature IDs)
feature_path = 'FeatureList.BasicFilter.txt'

# Dimension name used to do clustering
obs_dim = 'cell'  # observation
var_dim = 'chrom100k'  # feature

# HVF method:
# SVR: regression based
# Bins: normalize dispersion per bin
# Only the exact value 'SVR' selects the SVR method; any other value falls
# through to the bin based method in the HVF cells below.
hvf_method = 'SVR'
# mc_type values passed to the HVF calculation and to the AnnData export
mch_pattern = 'CHN'
mcg_pattern = 'CGN'
# passed as n_top_feature to the HVF functions
n_top_feature = 20000

# Downsample cells
# HVF is calculated on a random subset of this many cells when the metadata
# contains more cells than this; a falsy value (e.g. 0 or None) disables it.
downsample = 20000

Load Data

Metadata

# Read the filtered cell metadata table; the first CSV column becomes the index.
metadata = pd.read_csv(metadata_path, index_col=0)

# One metadata row per cell, so the row count is the number of cells.
total_cells = len(metadata)
print(f'Metadata of {total_cells} cells')
Metadata of 4958 cells
# Display the first rows of the metadata table as a quick sanity check.
metadata.head()
mCCCFrac mCGFrac mCHFrac FinalmCReads CellInputReadPairs MappingRate Plate Col384 Row384 CellTypeAnno
cell
3C_M_0 0.00738 0.75953 0.02543 1195574.0 2896392 0.625773 CEMBA171206-3C-1 18 0 IT-L23
3C_M_1 0.00938 0.77904 0.03741 1355517.0 3306366 0.631121 CEMBA171206-3C-1 18 1 IT-L5
3C_M_10 0.00915 0.82430 0.03678 2815807.0 7382298 0.657560 CEMBA171206-3C-1 21 1 L6b
3C_M_100 0.00978 0.79705 0.04231 2392650.0 5865154 0.671600 CEMBA171206-3C-1 0 3 MGE-Pvalb
3C_M_1000 0.00776 0.78781 0.02789 1922013.0 4800236 0.646285 CEMBA171206-3C-4 3 8 IT-L6
# The feature list file has no header; its first column holds the feature IDs.
feature_table = pd.read_csv(feature_path, header=None, index_col=0)

# Name the index after the feature dimension it will be used to select on.
use_features = feature_table.index.rename(var_dim)

MCDS

# Temporarily turn off dask's 'array.slicing.split_large_chunks' option while
# the dataset is opened and subset to the filtered features.
with dask.config.set({'array.slicing.split_large_chunks': False}):
    # still use all the cells to load MCDS
    total_mcds = MCDS.open(
        mcds_path_list,
        obs_dim=obs_dim,
        use_obs=metadata.index,
    ).sel({var_dim: use_features})

Add mC Rate

# Compute the methylation fraction for the feature dimension. Judging from the
# dataset repr shown below, this adds a `{var_dim}_da_frac` variable
# (here `chrom100k_da_frac`) to total_mcds.
# NOTE(review): normalize_per_cell / clip_norm_value presumably normalize each
# cell's fractions and cap the normalized value at 10 - confirm against the
# ALLCools documentation.
total_mcds.add_mc_rate(var_dim=var_dim,
                       normalize_per_cell=True,
                       clip_norm_value=10)

# Display the dataset summary.
total_mcds
<xarray.MCDS>
Dimensions:              (cell: 4958, chrom100k: 24042, count_type: 2, gene: 55487, mc_type: 2)
Coordinates:
  * mc_type              (mc_type) object 'CGN' 'CHN'
  * cell                 (cell) object '3C_M_0' '3C_M_1' ... '9H_M_3061'
  * gene                 (gene) object 'ENSMUSG00000102693.1' ... 'ENSMUSG000...
  * count_type           (count_type) object 'mc' 'cov'
    strand_type          <U4 'both'
    gene_chrom           (gene) object dask.array<chunksize=(55487,), meta=np.ndarray>
    gene_start           (gene) int64 dask.array<chunksize=(55487,), meta=np.ndarray>
    gene_end             (gene) int64 dask.array<chunksize=(55487,), meta=np.ndarray>
  * chrom100k            (chrom100k) int64 30 31 32 33 ... 26335 26336 26337
    chrom100k_chrom      (chrom100k) object dask.array<chunksize=(24042,), meta=np.ndarray>
    chrom100k_bin_start  (chrom100k) int64 dask.array<chunksize=(24042,), meta=np.ndarray>
    chrom100k_bin_end    (chrom100k) int64 dask.array<chunksize=(24042,), meta=np.ndarray>
Data variables:
    gene_da              (cell, gene, mc_type, count_type) uint16 dask.array<chunksize=(1199, 55487, 2, 2), meta=np.ndarray>
    chrom100k_da         (cell, chrom100k, mc_type, count_type) uint16 dask.array<chunksize=(1199, 24042, 2, 2), meta=np.ndarray>
    chrom100k_da_frac    (cell, chrom100k, mc_type) float64 dask.array<chunksize=(1199, 24042, 2), meta=np.ndarray>

Downsample Cells (Optional)

# Optionally calculate HVF on a random subset of cells: only applies when
# `downsample` is set and the metadata holds more cells than that.
if downsample and total_cells > downsample:
    # make a downsampled mcds
    print(f'Downsample cells to {downsample} to calculate HVF.')
    # fixed random_state keeps the sampled cell set reproducible between runs
    downsample_cell_ids = metadata.sample(downsample, random_state=0).index
    mcds = total_mcds.sel(
        {obs_dim: total_mcds.get_index(obs_dim).isin(downsample_cell_ids)})
else:
    mcds = total_mcds

# Use the configured observation dimension (obs_dim) rather than a hard-coded
# 'cell', so this check keeps working when obs_dim is changed in Parameters.
if load and (mcds.get_index(obs_dim).size <= 20000):
    # load the relevant data so the computation can be faster, watch out memory!
    mcds[f'{var_dim}_da_frac'].load()
/home/hanliu/miniconda3/envs/allcools/lib/python3.8/site-packages/dask/core.py:121: RuntimeWarning: invalid value encountered in true_divide
  return func(*(_execute_task(a, cache) for a in args))

The RuntimeWarning is expected (due to cov == 0). You can ignore it.

Highly Variable Feature

mCH

# Arguments shared by both HVF methods for the mCH pattern.
mch_hvf_kwargs = dict(var_dim=var_dim,
                      mc_type=mch_pattern,
                      n_top_feature=n_top_feature)

if hvf_method != 'SVR':
    # use bin based method
    mch_hvf = mcds.calculate_hvf(min_mean=0,
                                 max_mean=5,
                                 bin_min_features=5,
                                 mean_binsize=0.05,
                                 cov_binsize=100,
                                 **mch_hvf_kwargs)
else:
    # use SVR based method
    mch_hvf = mcds.calculate_hvf_svr(plot=True, **mch_hvf_kwargs)
Fitting SVR with gamma 0.0416, predicting feature dispersion using mc_frac_mean and cov_mean.
Total Feature Number:     24042
Highly Variable Feature:  20000 (83.2%)

mCG

# Arguments shared by both HVF methods for the mCG pattern.
mcg_hvf_kwargs = dict(var_dim=var_dim,
                      mc_type=mcg_pattern,
                      n_top_feature=n_top_feature)

if hvf_method != 'SVR':
    # use bin based method
    mcg_hvf = mcds.calculate_hvf(min_mean=0,
                                 max_mean=5,
                                 bin_min_features=5,
                                 mean_binsize=0.02,
                                 cov_binsize=20,
                                 **mcg_hvf_kwargs)
else:
    # use SVR based method
    mcg_hvf = mcds.calculate_hvf_svr(plot=True, **mcg_hvf_kwargs)
Fitting SVR with gamma 0.0416, predicting feature dispersion using mc_frac_mean and cov_mean.
Total Feature Number:     24042
Highly Variable Feature:  20000 (83.2%)

Save AnnData

# Export the mCH fraction matrix as AnnData with select_hvf=True; in the
# output shown below the result has 20000 features, matching n_top_feature.
# NOTE(review): `mcds` is the downsampled dataset whenever downsampling was
# applied earlier, so the saved AnnData would then contain only the sampled
# cells - confirm this is intended.
mch_adata = mcds.get_adata(mc_type=mch_pattern,
                           var_dim=var_dim,
                           select_hvf=True)

# Plain string literal: the file name has no placeholders, so no f-prefix.
mch_adata.write_h5ad('mCH.HVF.h5ad')

# Display the AnnData summary.
mch_adata
... storing 'chrom' as categorical
... storing 'CHN_feature_select' as categorical
... storing 'CGN_feature_select' as categorical
AnnData object with n_obs × n_vars = 4958 × 20000
    var: 'chrom', 'bin_start', 'bin_end', 'CHN_mean', 'CHN_dispersion', 'CHN_cov', 'CHN_score', 'CHN_feature_select', 'CGN_mean', 'CGN_dispersion', 'CGN_cov', 'CGN_score', 'CGN_feature_select'
# Export the mCG fraction matrix as AnnData with select_hvf=True; in the
# output shown below the result has 20000 features, matching n_top_feature.
# NOTE(review): `mcds` is the downsampled dataset whenever downsampling was
# applied earlier, so the saved AnnData would then contain only the sampled
# cells - confirm this is intended.
mcg_adata = mcds.get_adata(mc_type=mcg_pattern,
                           var_dim=var_dim,
                           select_hvf=True)

# Plain string literal: the file name has no placeholders, so no f-prefix.
mcg_adata.write_h5ad('mCG.HVF.h5ad')

# Display the AnnData summary.
mcg_adata
... storing 'chrom' as categorical
... storing 'CHN_feature_select' as categorical
... storing 'CGN_feature_select' as categorical
AnnData object with n_obs × n_vars = 4958 × 20000
    var: 'chrom', 'bin_start', 'bin_end', 'CHN_mean', 'CHN_dispersion', 'CHN_cov', 'CHN_score', 'CHN_feature_select', 'CGN_mean', 'CGN_dispersion', 'CGN_cov', 'CGN_score', 'CGN_feature_select'