Differentially Methylated Genes - Pairwise
import pandas as pd
import scanpy as sc
import anndata
import xarray as xr
import matplotlib.pyplot as plt
import seaborn as sns
import pybedtools
import dask
from ALLCools.plot import *
from ALLCools.mcds.MCDS import MCDS
from ALLCools.clustering import PairwiseDMG, aggregate_pairwise_dmg
import pathlib
import numpy as np
from itertools import combinations
import warnings
from sklearn.metrics import roc_auc_score
from concurrent.futures import ProcessPoolExecutor, as_completed
# ---------------------------------------------------------------------------
# Notebook parameters
# ---------------------------------------------------------------------------

# Input locations.
gene_meta_path = '/home/hanliu/ref/mouse/gencode/vm22/gencode.vM22.annotation.gene.flat.tsv.gz'
adata_path = 'adata.coord_only.h5ad'
# change this to the path to your filtered metadata
metadata_path = 'CellMetadata.PassQC.csv.gz'
# change this to the directory holding your gene fraction files
# (the loading cell globs "*_da_rate.nc" inside it)
gene_fraction_dir = 'gene_frac/'

# Data selection.
clustering_name = 'L1'  # column of adata.obs passed as the group labels
mc_type = 'CHN'  # value selected along the "mc_type" dimension
obs_dim = 'cell'
chrom_to_remove = ['chrM']

# Settings forwarded to PairwiseDMG.
top_n = 30000
max_cell_per_group = 1000
adj_p_cutoff = 1e-3
delta_rate_cutoff = 0.3
auroc_cutoff = 0.9
random_state = 0
n_jobs = 30

# Remaining settings; not referenced by the cells visible in this notebook.
min_cluster_cell_number = 10
chunk_size = 100
cpu = 10
Load
Gene Fraction Data
# Turn the configured directory string into a Path object.
gene_fraction_dir = pathlib.Path(gene_fraction_dir)

# Gene annotation table: the first CSV column becomes the index and is
# labelled 'gene_id'.
# NOTE(review): this reads a hard-coded 'GeneMetadata.csv.gz' rather than
# gene_meta_path -- confirm which file is intended.
gene_meta = pd.read_csv('GeneMetadata.csv.gz', index_col=0)
gene_meta.index.name = 'gene_id'

# Open every "*_da_rate.nc" file in the directory as one dataset, concatenated
# along the 'cell' dimension, and keep only the 'gene_da_rate' variable.
gene_frac_da = xr.open_mfdataset(
    f'{gene_fraction_dir}/*_da_rate.nc',
    combine='nested',
    concat_dim='cell',
)['gene_da_rate']

# Bare expression so the notebook renders the array summary.
gene_frac_da
<xarray.DataArray 'gene_da_rate' (cell: 5127, gene: 35664, mc_type: 2)> dask.array<concatenate, shape=(5127, 35664, 2), dtype=float32, chunksize=(1404, 35664, 2), chunktype=numpy.ndarray> Coordinates: * mc_type (mc_type) object 'CGN' 'CHN' * cell (cell) object '3C_M_1015' '3C_M_0' ... '9H_M_3055' '9H_M_3057' * gene (gene) object 'ENSMUSG00000102693.1' ... 'ENSMUSG00000096768.8' strand_type <U4 'both' gene_chrom (gene) object dask.array<chunksize=(35664,), meta=np.ndarray> gene_start (gene) int64 dask.array<chunksize=(35664,), meta=np.ndarray> gene_end (gene) int64 dask.array<chunksize=(35664,), meta=np.ndarray>
xarray.DataArray
'gene_da_rate'
- cell: 5127
- gene: 35664
- mc_type: 2
- dask.array<chunksize=(1202, 35664, 2), meta=np.ndarray>
Array Chunk Bytes 1.46 GB 400.58 MB Shape (5127, 35664, 2) (1404, 35664, 2) Count 14 Tasks 4 Chunks Type float32 numpy.ndarray - mc_type(mc_type)object'CGN' 'CHN'
array(['CGN', 'CHN'], dtype=object)
- cell(cell)object'3C_M_1015' ... '9H_M_3057'
array(['3C_M_1015', '3C_M_0', '3C_M_1005', ..., '9H_M_3061', '9H_M_3055', '9H_M_3057'], dtype=object)
- gene(gene)object'ENSMUSG00000102693.1' ... 'ENSM...
array(['ENSMUSG00000102693.1', 'ENSMUSG00000051951.5', 'ENSMUSG00000102348.1', ..., 'ENSMUSG00000100964.1', 'ENSMUSG00000095134.2', 'ENSMUSG00000096768.8'], dtype=object)
- strand_type()<U4'both'
array('both', dtype='<U4')
- gene_chrom(gene)objectdask.array<chunksize=(35664,), meta=np.ndarray>
Array Chunk Bytes 285.31 kB 285.31 kB Shape (35664,) (35664,) Count 15 Tasks 1 Chunks Type object numpy.ndarray - gene_start(gene)int64dask.array<chunksize=(35664,), meta=np.ndarray>
Array Chunk Bytes 285.31 kB 285.31 kB Shape (35664,) (35664,) Count 13 Tasks 1 Chunks Type int64 numpy.ndarray - gene_end(gene)int64dask.array<chunksize=(35664,), meta=np.ndarray>
Array Chunk Bytes 285.31 kB 285.31 kB Shape (35664,) (35664,) Count 13 Tasks 1 Chunks Type int64 numpy.ndarray
Clustering Data
# Cell metadata table; the first CSV column is used as the index.
metadata = pd.read_csv(metadata_path, index_col=0)

# One row per cell, so the row count is the number of cells.
total_cells = len(metadata.index)
print(f'Metadata of {total_cells} cells')
Metadata of 4958 cells
# Load the AnnData object stored at adata_path. Later cells take the cluster
# labels from its obs table (column clustering_name) and use its obs_names to
# select cells from the gene fraction array.
adata = anndata.read_h5ad(adata_path)
# Bare expression so the notebook displays the object summary.
adata
AnnData object with n_obs × n_vars = 4958 × 3465
obs: 'pre_clusters', 'CH_Rate', 'tsne_0', 'tsne_1', 'umap_0', 'umap_1', 'L1', 'L1_proba'
var: 'chrom100k_chrom', 'chrom100k_bin_start', 'chrom100k_bin_end', 'mean', 'dispersion', 'cov', 'score', 'feature_select', 'pre_clusters_enriched'
uns: 'neighbors', 'paga', 'pre_clusters_sizes', 'umap'
obsm: 'X_pca', 'X_tsne', 'X_umap'
obsp: 'connectivities', 'distances'
Pairwise DMG
# Configure the pairwise DMG run from the notebook parameters.
pwdmg = PairwiseDMG(
    top_n=top_n,
    max_cell_per_group=max_cell_per_group,
    adj_p_cutoff=adj_p_cutoff,
    delta_rate_cutoff=delta_rate_cutoff,
    auroc_cutoff=auroc_cutoff,
    random_state=random_state,
    n_jobs=n_jobs,
)

# Input matrix: the gene fraction array restricted to the chosen mc_type and
# to the cells present in adata. Group labels come from the clustering column
# of adata.obs; the 'Outlier' label is passed as the outlier group name.
pwdmg.fit_predict(
    x=gene_frac_da.sel(mc_type=mc_type, cell=adata.obs_names),
    groups=adata.obs[clustering_name],
    outlier='Outlier',
    cleanup=True,
)
Generating cluster AnnData files
/home/hanliu/miniconda3/envs/allcools/lib/python3.8/site-packages/xarray/core/indexing.py:1361: PerformanceWarning: Slicing with an out-of-order index is generating 135 times more chunks
return self.array[key]
/home/hanliu/miniconda3/envs/allcools/lib/python3.8/site-packages/xarray/core/indexing.py:1361: PerformanceWarning: Slicing with an out-of-order index is generating 124 times more chunks
return self.array[key]
Computing pairwise DMG
210 pairwise DMGs
1/210 finished
11/210 finished
21/210 finished
31/210 finished
41/210 finished
51/210 finished
61/210 finished
71/210 finished
81/210 finished
91/210 finished
101/210 finished
111/210 finished
121/210 finished
131/210 finished
141/210 finished
151/210 finished
161/210 finished
171/210 finished
181/210 finished
191/210 finished
201/210 finished
# Write the pairwise DMG table to an HDF file named after the mc_type,
# stored under the key 'data'.
pwdmg.dmg_table.to_hdf(
    f'PairwiseDMG.{mc_type}.hdf',
    key='data',
)
Cluster DMG
# Aggregate the pairwise results into one DMG collection per cluster, grouping
# by the clustering column of adata.
cluster_dmgs = aggregate_pairwise_dmg(pwdmg.dmg_table, adata, groupby=clustering_name)

# Store each cluster's DMGs under its own key in a single HDF file. The
# context manager closes the store even if a write fails.
# NOTE(review): HDFStore keys must be strings -- confirm the cluster labels are.
with pd.HDFStore(f'ClusterRankedDMG.{mc_type}.hdf') as hdf:
    for cluster, dmgs in cluster_dmgs.items():
        hdf[cluster] = dmgs