Source code for dandelion.preprocessing.external._preprocessing

#!/usr/bin/env python
# @Author: kt16
# @Date:   2020-05-12 17:56:02
# @Last Modified by:   Kelvin
# @Last Modified time: 2021-02-21 13:10:05

import os
import pandas as pd
import numpy as np
from subprocess import run
from datetime import timedelta
from anndata import AnnData
from time import time
from collections import OrderedDict
from time import time
from ...utilities._utilities import *
import scanpy as sc
import scipy.stats
import re
from os import PathLike
from typing import Union, Sequence, Tuple


def assigngenes_igblast(fasta: Union[str, PathLike],
                        igblast_db: Union[None, str] = None,
                        org: Literal['human', 'mouse'] = 'human',
                        loci: Literal['ig', 'tr'] = 'ig',
                        verbose: bool = False):
    """
    Reannotate with IgBLASTn.

    Calls ``AssignGenes.py igblast`` once per output format (``blast`` and
    ``airr``). Results are written to a ``tmp`` folder created next to
    `fasta`, named ``<fasta basename>_igblast.fmt7`` and
    ``<fasta basename>_igblast.tsv`` respectively.

    Parameters
    ----------
    fasta : PathLike
        fasta file for reannotation.
    igblast_db : PathLike, optional
        path to igblast database. If None, the IGDATA environment variable is used.
    org : str
        organism for germline sequences.
    loci : str
        `ig` or `tr` mode for running igblastn.
    verbose : bool
        whether or not to print the command used in terminal. Default is False.

    Raises
    ------
    OSError
        if `igblast_db` is None and the IGDATA environment variable is not set.
    """
    env = os.environ.copy()
    if igblast_db is None:
        try:
            igdb = env['IGDATA']
        except KeyError:
            # only a missing variable is expected here; anything else should surface as-is
            raise OSError(
                'Environmental variable IGDATA must be set. Otherwise, please provide path to igblast database') from None
    else:
        # expose the chosen database to the child process as well
        env['IGDATA'] = igblast_db
        igdb = igblast_db

    outfolder = os.path.join(os.path.abspath(os.path.dirname(fasta)), 'tmp')
    # exist_ok avoids the check-then-create race of a separate os.path.exists test
    os.makedirs(outfolder, exist_ok=True)

    informat_dict = {'blast': '_igblast.fmt7', 'airr': '_igblast.tsv'}
    prefix = os.path.basename(fasta).split('.fasta')[0]
    for fileformat in ['blast', 'airr']:
        outfile = prefix + informat_dict[fileformat]
        cmd = ['AssignGenes.py', 'igblast',
               '-s', fasta,
               '-b', igdb,
               '--organism', org,
               '--loci', loci,
               '--format', fileformat,
               '-o', "{}/{}".format(outfolder, outfile)]
        if verbose:
            print('Running command: %s\n' % (' '.join(cmd)))
        run(cmd, env=env)  # logs are printed to terminal
def makedb_igblast(fasta: Union[str, PathLike],
                   igblast_output: Union[None, str, PathLike] = None,
                   germline: Union[None, str, PathLike] = None,
                   org: Literal['human', 'mouse'] = 'human',
                   extended: bool = True,
                   verbose: bool = False):
    """
    Parses IgBLAST output to airr format.

    Builds and runs a ``MakeDb.py igblast`` command. The 10x annotation file
    is looked up next to `fasta`, with ``.fasta`` replaced by
    ``_annotations.csv``.

    Parameters
    ----------
    fasta : PathLike
        fasta file use for reannotation.
    igblast_output : PathLike, optional
        igblast output file. If None, ``tmp/<fasta basename>_igblast.fmt7``
        relative to the folder containing `fasta` is used.
    germline : PathLike, optional
        path to germline database. If None, ``imgt/<org>/vdj/`` under the
        GERMLINE environment variable is used.
    org : str
        organism of germline sequences.
    extended : bool
        whether or not to parse extended 10x annotations. Default is True.
    verbose : bool
        whether or not to print the command used in terminal. Default is False.

    Raises
    ------
    OSError
        if `germline` is None and the GERMLINE environment variable is not set.
    """
    env = os.environ.copy()
    if germline is None:
        try:
            gml = env['GERMLINE']
        except KeyError:
            raise OSError(
                'Environmental variable GERMLINE must be set. Otherwise, please provide path to folder containing germline fasta files.') from None
        # join (rather than string concatenation) so GERMLINE works with or
        # without a trailing slash; the final '' keeps the trailing separator
        gml = os.path.join(gml, 'imgt', org, 'vdj', '')
    else:
        # environment values must be str, so normalise PathLike input first
        gml = os.fspath(germline)
        env['GERMLINE'] = gml

    fasta_dir = os.path.dirname(fasta)
    fasta_name = os.path.basename(fasta)

    if igblast_output is None:
        # os.path.join keeps the path relative when `fasta` has no directory
        # part, instead of pointing at the filesystem root ('/tmp/...')
        igbo = os.path.join(
            fasta_dir, 'tmp', fasta_name.split('.fasta')[0] + '_igblast.fmt7')
    else:
        igbo = igblast_output

    cellranger_annotation = os.path.join(
        fasta_dir, fasta_name.replace('.fasta', '_annotations.csv'))

    cmd = ['MakeDb.py', 'igblast',
           '-i', igbo,
           '-s', fasta,
           '-r', gml,
           '--10x', cellranger_annotation]
    if extended:
        cmd.append('--extended')

    if verbose:
        print('Running command: %s\n' % (' '.join(cmd)))
    run(cmd, env=env)  # logs are printed to terminal
def parsedb_heavy(db_file: Union[str, PathLike], verbose: bool = False):
    """
    Parses AIRR table (heavy chain contigs only).

    Runs ``ParseDb.py select`` on the ``locus`` field with the value ``IGH``;
    the output name is the table's base name (up to ``.tsv``) plus ``_heavy``.

    Parameters
    ----------
    db_file : PathLike
        path to AIRR table.
    verbose : bool
        whether or not to print the command used in terminal. Default is False.
    """
    stem = os.path.basename(db_file).split('.tsv')[0]

    # assemble the argument vector option by option
    cmd = ['ParseDb.py', 'select']
    cmd += ['-d', db_file]
    cmd += ['-f', 'locus']
    cmd += ['-u', 'IGH']
    cmd += ['--logic', 'all']
    cmd += ['--regex']
    cmd += ['--outname', stem + '_heavy']

    if verbose:
        print('Running command: %s\n' % (' '.join(cmd)))

    run(cmd)  # logs are printed to terminal
def parsedb_light(db_file: Union[str, PathLike], verbose: bool = False):
    """
    Parses AIRR table (light chain contigs only).

    Runs ``ParseDb.py select`` on the ``locus`` field with the pattern
    ``IG[LK]``; the output name is the table's base name (up to ``.tsv``)
    plus ``_light``.

    Parameters
    ----------
    db_file : PathLike
        path to AIRR table.
    verbose : bool
        whether or not to print the command used in terminal. Default is False.
    """
    table_name = os.path.basename(db_file)
    outname = '{}{}'.format(table_name.split('.tsv')[0], '_light')

    selection = ['-f', 'locus', '-u', 'IG[LK]', '--logic', 'all', '--regex']
    cmd = ['ParseDb.py', 'select', '-d', db_file, *selection, '--outname', outname]

    if verbose:
        print('Running command: %s\n' % (' '.join(cmd)))

    run(cmd)  # logs are printed to terminal
def _germline_references(gml, org, mode, genotype_fasta, bundled):
    """Return the list of reference paths to pass after ``CreateGermlines.py -r``."""
    uses_heavy = mode in (None, 'heavy')
    # the genotyped V fasta only ever substitutes the heavy chain V reference
    genotyped = genotype_fasta is not None and uses_heavy

    if not bundled:
        # a user-supplied germline location is passed through untouched
        return [genotype_fasta, gml] if genotyped else [gml]

    segments = []
    if uses_heavy:
        segments += ['IGHV', 'IGHD', 'IGHJ']
    if mode in (None, 'light'):
        segments += ['IGKV', 'IGKJ', 'IGLV', 'IGLJ']
    refs = [os.path.join(gml, 'imgt_{}_{}.fasta'.format(org, seg))
            for seg in segments]
    if genotyped:
        refs[0] = genotype_fasta  # IGHV is always first when heavy is included
    return refs


def creategermlines(db_file: Union[str, PathLike],
                    germtypes: Union[None, str] = None,
                    germline: Union[None, PathLike, str] = None,
                    org: Literal['human', 'mouse'] = 'human',
                    genotype_fasta: Union[None, PathLike, str] = None,
                    v_field: Union[None, Literal['v_call', 'v_call_genotyped']] = None,
                    cloned: bool = False,
                    mode: Union[None, Literal['heavy', 'light']] = None,
                    verbose: bool = False):
    """
    Wrapper for CreateGermlines.py for reconstructing germline sequences.

    Parameters
    ----------
    db_file : PathLike
        path to AIRR table.
    germtypes : str, optional
        germline type for reconstuction. Defaults to 'dmask' when None.
    germline : PathLike, optional
        location to germline fasta files. If None, the individual
        ``imgt_<org>_<segment>.fasta`` files under ``imgt/<org>/vdj/`` of the
        GERMLINE environment variable are used.
    org : str
        organism for germline sequences.
    genotype_fasta : PathLike, optional
        location to corrected v germine fasta file. Used in place of the IGHV
        reference; ignored when `mode` is 'light'.
    v_field : str, optional
        name of column for v segment to perform reconstruction. If None, the
        ``--vf`` option is not passed and CreateGermlines.py uses its own default.
    cloned : bool
        whether or not to run with cloned option.
    mode : str, optional
        whether to run on heavy or light mode. If left as None, heavy and light will be run together.
    verbose : bool
        whether or not to print the command used in terminal. Default is False.

    Raises
    ------
    ValueError
        if `mode` is not None, 'heavy' or 'light'.
    OSError
        if `germline` is None and the GERMLINE environment variable is not set.
    """
    if mode not in (None, 'heavy', 'light'):
        # previously fell through every branch and failed later on an unbound `cmd`
        raise ValueError(
            "mode must be None, 'heavy' or 'light', not {!r}.".format(mode))

    env = os.environ.copy()
    bundled = germline is None
    if bundled:
        try:
            gml = env['GERMLINE']
        except KeyError:
            raise OSError(
                'Environmental variable GERMLINE must be set. Otherwise, please provide path to folder containing germline fasta files.') from None
        # join so that GERMLINE works with or without a trailing slash
        gml = os.path.join(gml, 'imgt', org, 'vdj')
    else:
        # environment values must be str, so normalise PathLike input first
        gml = os.fspath(germline)
        env['GERMLINE'] = gml

    germ_type = 'dmask' if germtypes is None else germtypes

    chain = {'heavy': 'heavy chain ', 'light': 'light chain ', None: ''}[mode]
    vf_label = 'the CreateGermlines.py default V field' if v_field is None else v_field
    # "for each clone" only applies when --cloned is actually requested
    ending = ' for each clone.' if cloned else '.'
    print(' Reconstructing {}{} germline sequences with {}{}'.format(
        chain, germ_type, vf_label, ending))

    cmd = ['CreateGermlines.py', '-d', db_file, '-g', germ_type]
    if cloned:
        cmd.append('--cloned')
    cmd.append('-r')
    cmd.extend(_germline_references(gml, org, mode, genotype_fasta, bundled))
    if v_field is not None:
        # a None entry in argv would make subprocess.run raise TypeError
        cmd.extend(['--vf', v_field])

    if verbose:
        print('Running command: %s\n' % (' '.join(cmd)))
    run(cmd, env=env)  # logs are printed to terminal
def tigger_genotype(data: Union[str, PathLike],
                    v_germline: Union[None, PathLike, str] = None,
                    outdir: Union[None, PathLike, str] = None,
                    org: Literal['human', 'mouse'] = 'human',
                    fileformat: Literal['airr', 'changeo'] = 'airr',
                    novel_: Literal['YES', 'NO'] = 'YES',
                    verbose: bool = False):
    """
    Reassign alleles with TIgGER in R.

    Builds and runs a ``tigger-genotype.R`` command and, when `verbose` is
    set, reports the wall clock time it took.

    Parameters
    ----------
    data : PathLike
        vdj tabulated data, in Change-O (TAB) or AIRR (TSV) format.
    v_germline : PathLike, optional
        fasta file containing IMGT-gapped V segment reference germlines, or a
        folder containing ``imgt_<org>_IGHV.fasta``. A direct file path must
        end with '.fasta' and contain 'ighv' (case-insensitive). Defaults to
        ``imgt/<org>/vdj/imgt_<org>_IGHV.fasta`` under $GERMLINE.
    outdir : PathLike, optional
        output directory. Defaults to the directory containing `data`.
    org : str
        organism for germline sequences.
    fileformat : str
        format for running tigger. Default is 'airr'. Also accepts 'changeo'.
    novel_ : str
        whether or not to run novel allele discovery. Default is 'YES'.
    verbose : bool
        whether or not to print the command used in terminal. Default is False.

    Raises
    ------
    OSError
        if `v_germline` is None and the GERMLINE environment variable is not
        set, or if `v_germline` does not resolve to a usable IGHV fasta file.
    """
    start_time = time()
    env = os.environ.copy()
    ighv_name = 'imgt_' + org + '_IGHV.fasta'

    if v_germline is None:
        try:
            gml = env['GERMLINE']
        except KeyError:
            raise OSError('Environmental variable GERMLINE is not set. Please provide either the path to the folder containing the germline IGHV fasta file, or direct path to the germline IGHV fasta file.') from None
        # join so that GERMLINE works with or without a trailing slash
        gml = os.path.join(gml, 'imgt', org, 'vdj', ighv_name)
    else:
        v_germline = os.fspath(v_germline)
        if os.path.isdir(v_germline):
            # the separator used to be stripped and never re-added, so the
            # file inside the folder could not be found
            gml = os.path.join(v_germline, ighv_name)
            if not os.path.isfile(gml):
                raise OSError("Input for germline is incorrect. Please rename IGHV germline file to '{}'. Otherwise, please provide path to folder containing the germline IGHV fasta file, or direct path to the germline IGHV fasta file.".format(gml))
        else:
            if not v_germline.endswith('.fasta'):
                raise OSError(
                    'Input for germline is incorrect {}. Please provide path to folder containing the germline IGHV fasta file, or direct path to the germline IGHV fasta file.'.format(v_germline))
            if not (os.path.isfile(v_germline) and 'ighv' in v_germline.lower()):
                # fail here with a clear message instead of leaving `gml` unbound
                raise OSError(
                    "Input for germline is incorrect {}. The file must exist and its path must contain 'IGHV'.".format(v_germline))
            gml = v_germline

    if outdir is not None:
        # trailing '' keeps the trailing separator the command has always carried
        out_dir = os.path.join(os.fspath(outdir), '')
    else:
        # dirname is '' for a bare filename; pass '.' rather than an empty argument
        out_dir = os.path.dirname(data) or '.'

    cmd = ['tigger-genotype.R',
           '-d', data,
           '-r', gml,
           '-n', os.path.basename(data).split('.tsv')[0],
           '-N', novel_,
           '-o', out_dir,
           '-f', fileformat]

    print(' Reassigning alleles')
    if verbose:
        print('Running command: %s\n' % (' '.join(cmd)))
    run(cmd, env=env)  # logs are printed to terminal

    elapsed_time_secs = time() - start_time
    msg = "tigger-genotype execution took: %s secs (Wall clock time)\n" % timedelta(
        seconds=round(elapsed_time_secs))
    if verbose:
        print(msg)
def recipe_scanpy_qc(self: AnnData,
                     max_genes: int = 2500,
                     min_genes: int = 200,
                     mito_cutoff: int = 5,
                     pval_cutoff: float = 0.1,
                     min_counts: Union[None, int] = None,
                     max_counts: Union[None, int] = None,
                     blacklist: Union[None, Sequence] = None) -> None:
    """
    Recipe for running a standard scanpy QC workflow.

    All processing happens on a copy of the input; only the resulting `.obs`
    table is written back onto the input object. The columns `leiden`,
    `leiden_R`, `scrublet_cluster_score` and `scrublet_score_bh_pval` are
    dropped before that, so the retained QC columns include `scrublet_score`,
    `percent_mito`, `n_counts`, `n_genes`, `is_doublet` and `filter_rna`
    (True for cells that fail any criterion).

    Parameters
    ----------
    self : AnnData
        annotated data matrix of shape n_obs × n_vars. Rows correspond to cells and columns to genes.
    max_genes : int
        maximum number of genes expressed required for a cell to pass filtering. Default is 2500.
    min_genes : int
        minimum number of genes expressed required for a cell to pass filtering. Default is 200.
    mito_cutoff : float
        maximum percentage mitochondrial content allowed for a cell to pass filtering. Default is 5.
    pval_cutoff : float
        maximum Benjamini-Hochberg corrected p value from doublet detection protocol allowed for a cell to pass filtering. Default is 0.1.
    min_counts : int, optional
        minimum number of counts required for a cell to pass filtering. Default is None (not applied).
    max_counts : int, optional
        maximum number of counts required for a cell to pass filtering. Default is None (not applied).
    blacklist : sequence, optional
        if provided, will exclude these genes from highly variable genes list.

    Returns
    -------
    None. `self.obs` is replaced with a table that contains the filtering information.

    Raises
    ------
    ImportError
        if scrublet is not installed.
    """
    _adata = self.copy()

    # run scrublet
    try:
        import scrublet as scr
    except ImportError:
        raise ImportError('Please install scrublet with pip install scrublet.')
    scrub = scr.Scrublet(_adata.X)
    doublet_scores, _ = scrub.scrub_doublets(verbose=False)
    _adata.obs['scrublet_score'] = doublet_scores

    # overcluster prep. run basic scanpy pipeline
    sc.pp.filter_cells(_adata, min_genes=0)
    mito_genes = _adata.var_names.str.startswith('MT-')
    _adata.obs['percent_mito'] = np.sum(
        _adata[:, mito_genes].X, axis=1) / np.sum(_adata.X, axis=1)*100
    _adata.obs['n_counts'] = _adata.X.sum(axis=1)
    sc.pp.normalize_total(_adata, target_sum=1e4)
    sc.pp.log1p(_adata)
    sc.pp.highly_variable_genes(
        _adata, min_mean=0.0125, max_mean=3, min_disp=0.5)

    # keep TCR/BCR genes and any blacklisted genes out of the highly variable set
    vdj_pattern = re.compile('^TR[AB][VDJ]|^IG[HKL][VDJC]')
    for gene in _adata.var.index:
        if vdj_pattern.search(gene) or (blacklist is not None and gene in blacklist):
            _adata.var.at[gene, 'highly_variable'] = False

    # copy so the following in-place steps work on real data, not a view
    _adata = _adata[:, _adata.var['highly_variable']].copy()
    sc.pp.scale(_adata, max_value=10)
    sc.tl.pca(_adata, svd_solver='arpack')
    sc.pp.neighbors(_adata, n_neighbors=10, n_pcs=50)

    # overclustering proper - do basic clustering first, then cluster each cluster
    sc.tl.leiden(_adata)
    clusters = list(np.unique(_adata.obs['leiden']))
    # the first cluster creates 'leiden_R' from 'leiden'; the remaining ones
    # refine 'leiden_R' itself (weird how the new anndata/scanpy is forcing this)
    sc.tl.leiden(_adata, restrict_to=(
        'leiden', [clusters[0]]), key_added='leiden_R')
    for clus in clusters[1:]:
        sc.tl.leiden(_adata, restrict_to=(
            'leiden_R', [clus]), key_added='leiden_R')

    # compute the cluster scores - the median of Scrublet scores per overclustered cluster
    for clus in np.unique(_adata.obs['leiden_R']):
        in_cluster = _adata.obs['leiden_R'] == clus
        _adata.obs.loc[in_cluster, 'scrublet_cluster_score'] = np.median(
            _adata.obs.loc[in_cluster, 'scrublet_score'])

    # now compute doublet p-values. figure out the median and mad (from above-median values) for the distribution
    med = np.median(_adata.obs['scrublet_cluster_score'])
    mask = _adata.obs['scrublet_cluster_score'] > med
    mad = np.median(_adata.obs['scrublet_cluster_score'][mask]-med)
    # 1 sided test for catching outliers
    pvals = 1 - \
        scipy.stats.norm.cdf(
            _adata.obs['scrublet_cluster_score'], loc=med, scale=1.4826*mad)
    _adata.obs['scrublet_score_bh_pval'] = bh(pvals)

    # threshold the p-values to get doublet calls.
    is_doublet = _adata.obs['scrublet_score_bh_pval'] < pval_cutoff

    # a cell is flagged when it falls outside [min_genes, max_genes]; the old
    # chained comparison `min_genes < n > max_genes` never applied the lower bound
    n_genes = _adata.obs['n_genes']
    filter_rna = (n_genes < min_genes) | (n_genes > max_genes) | \
        (_adata.obs['percent_mito'] >= mito_cutoff) | \
        is_doublet
    # optional count thresholds, only applied when explicitly requested
    if min_counts is not None:
        filter_rna = filter_rna | (_adata.obs['n_counts'] < min_counts)
    if max_counts is not None:
        filter_rna = filter_rna | (_adata.obs['n_counts'] > max_counts)

    _adata.obs['is_doublet'] = is_doublet.astype('category')
    _adata.obs['filter_rna'] = filter_rna

    # removing columns that probably don't need anymore
    _adata.obs = _adata.obs.drop(
        ['leiden', 'leiden_R', 'scrublet_cluster_score', 'scrublet_score_bh_pval'], axis=1)
    self.obs = _adata.obs.copy()