Calculate Gene mC Fractions
import pandas as pd
import scanpy as sc
import anndata
import xarray as xr
import matplotlib.pyplot as plt
import seaborn as sns
import pybedtools
import dask
from ALLCools.plot import *
from ALLCools.mcds.MCDS import MCDS
import pathlib
import numpy as np
# ---- Input files ----

# Gene annotation table; it is read as a tab-separated file indexed by
# `gene_id` further down.
gene_meta_path = '/home/hanliu/ref/mouse/gencode/vm22/gencode.vM22.annotation.gene.flat.tsv.gz'

# change this to the path to your filtered metadata
metadata_path = 'CellMetadata.PassQC.csv.gz'

# change this to the paths to your MCDS files
mcds_path_list = [
    '../Data/3C-171206.mcds',
    '../Data/3C-171207.mcds',
    '../Data/9H-190212.mcds',
    '../Data/9H-190219.mcds',
]

# ---- Parameters ----

# Genes located on these chromosomes are excluded from the feature list.
chrom_to_remove = ['chrM']

# Names of the observation and feature dimensions used when opening MCDS.
obs_dim = 'cell'
var_dim = 'gene'

# A gene is kept only if its mean coverage across cells is greater than this.
min_cov = 5
Load metadata
# Gene annotation table (tab-separated), indexed by gene ID.
gene_meta = pd.read_csv(gene_meta_path, sep='\t', index_col='gene_id')

# Cell metadata; the first column becomes the index.
metadata = pd.read_csv(metadata_path, index_col=0)

# Report how many cells the metadata table contains.
total_cells = len(metadata)
print(f'Metadata of {total_cells} cells')
Metadata of 4958 cells
Filter genes by overlap and chromosomes
# IDs of genes that will be excluded from the feature list.
genes_to_skip = set()

# Skip smaller genes mostly covered by a larger gene, e.g., a miRNA within a
# protein coding gene. F=0.9 sets the minimum overlap as a fraction of gene_b:
# gene_b is only reported for gene_a when about 90% or more of gene_b is
# overlapped by gene_a. In that case only gene_a is kept for the DMG test.
#
# `bedtools map` requires position-sorted input, so the BED records are sorted
# explicitly instead of relying on the row order of the annotation table.
gene_bed = pybedtools.BedTool.from_dataframe(
    gene_meta.reset_index()[['chrom', 'start', 'end', 'gene_id']]).sort()
mapped_bam = gene_bed.map(b=gene_bed, c=4, o='distinct', F=0.9)

# The last two columns of each mapped record are the gene itself and a
# comma-separated list of the distinct gene IDs that passed the overlap test
# (which includes the gene itself).
overlap_table = mapped_bam.to_dataframe()
for *_, gene_a, gene_b_str in overlap_table.itertuples(index=False):
    genes_to_skip.update(
        gene_b for gene_b in gene_b_str.split(',') if gene_b != gene_a)

# remove certain chromosomes
on_removed_chrom = gene_meta['chrom'].isin(chrom_to_remove)
genes_to_skip.update(gene_meta.index[on_removed_chrom])

# Everything that was not flagged above remains a usable feature.
use_features = gene_meta.index[~gene_meta.index.isin(genes_to_skip)]
print(f'{use_features.size} features remained')
/home/hanliu/miniconda3/envs/allcools/lib/python3.8/subprocess.py:849: RuntimeWarning: line buffering (buffering=1) isn't supported in binary mode, the default buffer size will be used
self.stderr = io.open(errread, 'rb', bufsize)
41871 features remained
Filter genes by cell mean coverage
# Open all MCDS files together, restricted to the cells in `metadata` and to
# the genes that survived the overlap/chromosome filter, then add the
# per-gene mean coverage across cells as a coordinate.
with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    # still use all the cells to load MCDS
    mcds = MCDS.open(mcds_path_list, obs_dim=obs_dim,
                     use_obs=metadata.index).sel({var_dim: use_features})
    mcds.add_feature_cov_mean(var_dim=var_dim)

# Pull the per-gene mean coverage out as a pandas object indexed by gene.
feature_cov_mean = mcds.coords[f'{var_dim}_cov_mean'].to_pandas()

# Keep only genes whose mean coverage is greater than `min_cov`.
# Use Index.intersection explicitly: `Index & Index` as a set operation was
# deprecated in pandas 1.x and is an element-wise logical AND in pandas >= 2.0.
use_features = use_features.intersection(
    feature_cov_mean[feature_cov_mean > min_cov].index)
print(f'{use_features.size} features remained')
Feature gene mean cov across cells added in MCDS.coords['gene_cov_mean'].
35664 features remained

# Apply the same mean-coverage threshold to the MCDS itself; the notebook
# displays the returned object (35664 genes after filtering).
# NOTE(review): the return value is not assigned here, so `mcds` presumably
# stays unfiltered unless this method works in place - confirm against the
# ALLCools API. Later cells rely on `use_features`, not on this result.
mcds.filter_feature_by_cov_mean(var_dim, min_cov=min_cov)
Before cov mean filter: 41871 gene
After cov mean filter: 35664 gene 85.2%
<xarray.MCDS> Dimensions: (cell: 4958, chrom100k: 27269, count_type: 2, gene: 35664, mc_type: 2) Coordinates: * mc_type (mc_type) object 'CGN' 'CHN' * cell (cell) object '3C_M_0' '3C_M_1' ... '9H_M_3061' * gene (gene) object 'ENSMUSG00000102693.1' ... 'ENSMUSG000... * count_type (count_type) object 'mc' 'cov' strand_type <U4 'both' gene_chrom (gene) object dask.array<chunksize=(35664,), meta=np.ndarray> gene_start (gene) int64 dask.array<chunksize=(35664,), meta=np.ndarray> gene_end (gene) int64 dask.array<chunksize=(35664,), meta=np.ndarray> * chrom100k (chrom100k) int64 0 1 2 3 4 ... 27265 27266 27267 27268 chrom100k_chrom (chrom100k) object dask.array<chunksize=(27269,), meta=np.ndarray> chrom100k_bin_start (chrom100k) int64 dask.array<chunksize=(27269,), meta=np.ndarray> chrom100k_bin_end (chrom100k) int64 dask.array<chunksize=(27269,), meta=np.ndarray> gene_cov_mean (gene) float64 13.71 6.909e+03 ... 38.53 1.328e+03 Data variables: gene_da (cell, gene, mc_type, count_type) uint16 dask.array<chunksize=(1199, 35664, 2, 2), meta=np.ndarray> chrom100k_da (cell, chrom100k, mc_type, count_type) uint16 dask.array<chunksize=(1199, 27269, 2, 2), meta=np.ndarray>
xarray.MCDS
- cell: 4958
- chrom100k: 27269
- count_type: 2
- gene: 35664
- mc_type: 2
- mc_type(mc_type)object'CGN' 'CHN'
array(['CGN', 'CHN'], dtype=object)
- cell(cell)object'3C_M_0' '3C_M_1' ... '9H_M_3061'
array(['3C_M_0', '3C_M_1', '3C_M_10', ..., '9H_M_3057', '9H_M_3059', '9H_M_3061'], dtype=object)
- gene(gene)object'ENSMUSG00000102693.1' ... 'ENSM...
array(['ENSMUSG00000102693.1', 'ENSMUSG00000051951.5', 'ENSMUSG00000102348.1', ..., 'ENSMUSG00000100964.1', 'ENSMUSG00000095134.2', 'ENSMUSG00000096768.8'], dtype=object)
- count_type(count_type)object'mc' 'cov'
array(['mc', 'cov'], dtype=object)
- strand_type()<U4'both'
array('both', dtype='<U4')
- gene_chrom(gene)objectdask.array<chunksize=(35664,), meta=np.ndarray>
Array Chunk Bytes 285.31 kB 285.31 kB Shape (35664,) (35664,) Count 17 Tasks 1 Chunks Type object numpy.ndarray - gene_start(gene)int64dask.array<chunksize=(35664,), meta=np.ndarray>
Array Chunk Bytes 285.31 kB 285.31 kB Shape (35664,) (35664,) Count 15 Tasks 1 Chunks Type int64 numpy.ndarray - gene_end(gene)int64dask.array<chunksize=(35664,), meta=np.ndarray>
Array Chunk Bytes 285.31 kB 285.31 kB Shape (35664,) (35664,) Count 15 Tasks 1 Chunks Type int64 numpy.ndarray - chrom100k(chrom100k)int640 1 2 3 ... 27265 27266 27267 27268
array([ 0, 1, 2, ..., 27266, 27267, 27268])
- chrom100k_chrom(chrom100k)objectdask.array<chunksize=(27269,), meta=np.ndarray>
Array Chunk Bytes 218.15 kB 218.15 kB Shape (27269,) (27269,) Count 15 Tasks 1 Chunks Type object numpy.ndarray - chrom100k_bin_start(chrom100k)int64dask.array<chunksize=(27269,), meta=np.ndarray>
Array Chunk Bytes 218.15 kB 218.15 kB Shape (27269,) (27269,) Count 13 Tasks 1 Chunks Type int64 numpy.ndarray - chrom100k_bin_end(chrom100k)int64dask.array<chunksize=(27269,), meta=np.ndarray>
Array Chunk Bytes 218.15 kB 218.15 kB Shape (27269,) (27269,) Count 13 Tasks 1 Chunks Type int64 numpy.ndarray - gene_cov_mean(gene)float6413.71 6.909e+03 ... 38.53 1.328e+03
array([ 13.70895522, 6908.55022186, 23.27349738, ..., 14.97277128, 38.52622025, 1328.03529649])
- gene_da(cell, gene, mc_type, count_type)uint16dask.array<chunksize=(1199, 35664, 2, 2), meta=np.ndarray>
Array Chunk Bytes 1.41 GB 374.90 MB Shape (4958, 35664, 2, 2) (1314, 35664, 2, 2) Count 30 Tasks 4 Chunks Type uint16 numpy.ndarray - chrom100k_da(cell, chrom100k, mc_type, count_type)uint16dask.array<chunksize=(1199, 27269, 2, 2), meta=np.ndarray>
Array Chunk Bytes 1.08 GB 286.65 MB Shape (4958, 27269, 2, 2) (1314, 27269, 2, 2) Count 22 Tasks 4 Chunks Type uint16 numpy.ndarray
Add Gene mC Fraction per MCDS file
# All per-file results are written into one output directory.
gene_frac_dir = pathlib.Path('gene_frac')
gene_frac_dir.mkdir(exist_ok=True)

for mcds_path in mcds_path_list:
    # The output file name is the MCDS file name immediately followed by
    # '<var_dim>_da_rate.nc' (there is no separator between the two parts).
    output_name = pathlib.Path(mcds_path).name + f'{var_dim}_da_rate.nc'
    output_path = gene_frac_dir / output_name

    # Files that already exist are not recomputed.
    if output_path.exists():
        continue

    print(f'Computaing gene mC fraction for {mcds_path}')
    mcds = MCDS.open(mcds_path, obs_dim=obs_dim)

    # remove non-related data: only the '<var_dim>_da' variable is kept
    keep_da = f'{var_dim}_da'
    del_das = [da for da in mcds if da != keep_da]
    for da in del_das:
        del mcds[da]

    # Load the remaining data, then restrict it to the selected features.
    mcds.load()
    mcds = mcds.sel({var_dim: use_features})
    mcds.add_mc_rate(var_dim=var_dim,
                     normalize_per_cell=True,
                     clip_norm_value=10)

    # Store the result under fixed 'gene' / 'gene_da_rate' names regardless
    # of the value of `var_dim`.
    rename_map = {var_dim: 'gene', f'{var_dim}_da_rate': 'gene_da_rate'}
    mcds = mcds.rename(rename_map)

    # use float32 to reduce file size and speedup IO
    gene_frac = mcds['gene_da_rate'].astype('float32')
    gene_frac.to_netcdf(output_path)
Save gene metadata together with gene fraction files
# Write the annotation rows of the retained genes next to the gene fraction
# files, so both share the same output directory.
gene_meta_out_path = gene_frac_dir / 'GeneMetadata.csv.gz'
use_gene_meta = gene_meta.loc[use_features]
use_gene_meta.to_csv(gene_meta_out_path)
How to create adata from gene mC frac files
Create xr.DataArray (unloaded)
# gene_frac_paths = gene_frac_dir.glob('*_da_rate.nc')
# gene_frac_da = xr.open_mfdataset(gene_frac_paths, concat_dim='cell', combine='nested')['gene_da_rate']
# gene_frac_da
<xarray.DataArray 'gene_da_rate' (cell: 5127, gene: 35664, mc_type: 2)> dask.array<concatenate, shape=(5127, 35664, 2), dtype=float32, chunksize=(1404, 35664, 2), chunktype=numpy.ndarray> Coordinates: * mc_type (mc_type) object 'CGN' 'CHN' * cell (cell) object '3C_M_1015' '3C_M_0' ... '9H_M_3055' '9H_M_3057' * gene (gene) object 'ENSMUSG00000102693.1' ... 'ENSMUSG00000096768.8' strand_type <U4 'both' gene_chrom (gene) object dask.array<chunksize=(35664,), meta=np.ndarray> gene_start (gene) int64 dask.array<chunksize=(35664,), meta=np.ndarray> gene_end (gene) int64 dask.array<chunksize=(35664,), meta=np.ndarray>
xarray.DataArray
'gene_da_rate'
- cell: 5127
- gene: 35664
- mc_type: 2
- dask.array<chunksize=(1202, 35664, 2), meta=np.ndarray>
Array Chunk Bytes 1.46 GB 400.58 MB Shape (5127, 35664, 2) (1404, 35664, 2) Count 14 Tasks 4 Chunks Type float32 numpy.ndarray - mc_type(mc_type)object'CGN' 'CHN'
array(['CGN', 'CHN'], dtype=object)
- cell(cell)object'3C_M_1015' ... '9H_M_3057'
array(['3C_M_1015', '3C_M_0', '3C_M_1005', ..., '9H_M_3061', '9H_M_3055', '9H_M_3057'], dtype=object)
- gene(gene)object'ENSMUSG00000102693.1' ... 'ENSM...
array(['ENSMUSG00000102693.1', 'ENSMUSG00000051951.5', 'ENSMUSG00000102348.1', ..., 'ENSMUSG00000100964.1', 'ENSMUSG00000095134.2', 'ENSMUSG00000096768.8'], dtype=object)
- strand_type()<U4'both'
array('both', dtype='<U4')
- gene_chrom(gene)objectdask.array<chunksize=(35664,), meta=np.ndarray>
Array Chunk Bytes 285.31 kB 285.31 kB Shape (35664,) (35664,) Count 15 Tasks 1 Chunks Type object numpy.ndarray - gene_start(gene)int64dask.array<chunksize=(35664,), meta=np.ndarray>
Array Chunk Bytes 285.31 kB 285.31 kB Shape (35664,) (35664,) Count 13 Tasks 1 Chunks Type int64 numpy.ndarray - gene_end(gene)int64dask.array<chunksize=(35664,), meta=np.ndarray>
Array Chunk Bytes 285.31 kB 285.31 kB Shape (35664,) (35664,) Count 13 Tasks 1 Chunks Type int64 numpy.ndarray
Create anndata.AnnData (only load used cells)
# use_metadata = metadata.iloc[:100, :]
#
# example_adata = anndata.AnnData(
# X=gene_frac_da.sel({
# obs_dim: use_metadata.index,
# 'mc_type': 'CHN'
# }).values,
# obs=use_metadata,
# var=use_gene_meta
# )
#
# example_adata
AnnData object with n_obs × n_vars = 100 × 35664
obs: 'AllcPath', 'CCC_Rate', 'CG_Rate', 'CG_RateAdj', 'CH_Rate', 'CH_RateAdj', 'FinalReads', 'InputReads', 'MappedReads', 'Region', 'index_name', 'uid', 'BamFilteringRate', 'MappingRate', 'Pos96', 'Plate', 'Col96', 'Row96', 'Col384', 'Row384', 'FACS_Date', 'Slice'
var: 'chrom', 'source', 'feature', 'start', 'end', 'score', 'strand', 'phase', 'transcript_id', 'gene_type', 'gene_status', 'gene_name', 'transcript_type', 'transcript_status', 'transcript_name', 'exon_number', 'exon_id', 'level', 'mgi_id', 'havana_gene', 'tag'