#!/usr/bin/env python
# @Author: kt16
# @Date: 2020-05-12 14:01:32
# @Last Modified by: Kelvin
# @Last Modified time: 2021-03-30 17:14:26
import os
import pandas as pd
import numpy as np
import scipy.sparse
import networkx as nx
import bz2
import gzip
import _pickle as cPickle
from ..utilities._utilities import *
from ..utilities._core import *
from typing import Union, Sequence, Tuple
def fasta_iterator(fh):
    """
    Iterate over the records of a FASTA file.

    Parameters
    ----------
    fh : file-like
        Open text-mode handle; only its ``readline`` method is used, and it
        must return an empty string at end of input.

    Yields
    ------
    tuple of (str, str)
        The header line without the leading ``>`` and without trailing
        whitespace, and the sequence with all of its lines joined together.
        Input without any header line yields nothing.
    """
    header = None
    chunks = []
    while True:
        line = fh.readline()
        if not line:
            # End of input. Checking here (rather than only looking for '>')
            # also terminates cleanly on empty or header-less input.
            break
        if line.startswith('>'):
            if header is not None:
                yield (header, ''.join(chunks))
            # Drop only the '>' and let rstrip remove the newline, so a final
            # header line lacking a trailing newline keeps its last character.
            header = line[1:].rstrip()
            chunks = []
        elif header is not None:
            # Any text that precedes the first header is ignored.
            chunks.append(line.rstrip())
    if header is not None:
        yield (header, ''.join(chunks))
def Write_output(out: str, file: str):
    """
    Append text to a file.

    Parameters
    ----------
    out : str
        text to write; it is written as given, no newline is added.
    file : str
        path of the file to append to. The file is created if it does not exist.

    Returns
    -------
    An empty tuple (kept so existing callers see the same return value).
    """
    # The context manager closes the handle even when the write fails.
    with open(file, "a") as fh:
        fh.write(out)
    return ()
def load_data(obj: Union[pd.DataFrame, str]) -> pd.DataFrame:
    """
    Reads in or copy dataframe object and set sequence_id as index without dropping.

    Parameters
    ----------
    obj : DataFrame, str
        file path to .tsv file or pandas DataFrame object.

    Returns
    -------
    pandas DataFrame object indexed by `sequence_id`; the `sequence_id` column
    is kept. A DataFrame input is copied, never modified.

    Raises
    ------
    TypeError
        if `obj` is neither a DataFrame nor a path to an existing file.
    KeyError
        if the table has no `sequence_id` column.
    """
    # Check the type first so a DataFrame is never stringified just to test
    # whether it happens to be a path.
    if isinstance(obj, pd.DataFrame):
        obj_ = obj.copy()
    elif os.path.isfile(str(obj)):
        # Read errors propagate to the caller; swallowing them here would only
        # resurface as an unrelated unbound-variable error below.
        obj_ = pd.read_csv(obj, sep='\t')
    else:
        raise TypeError(
            "Either input is not of <class 'pandas.core.frame.DataFrame'> or file does not exist.")

    if 'sequence_id' not in obj_.columns:
        raise KeyError("'sequence_id' not found in columns of input")
    return obj_.set_index('sequence_id', drop=False)
def read_pkl(filename: str = 'dandelion_data.pkl.pbz2') -> Dandelion:
    """
    Reads in and returns a `Dandelion` class saved using pickle format.

    Parameters
    ----------
    filename : str
        path to `.pkl` file. The file is opened through `bz2` or `gzip` when
        `isBZIP` / `isGZIP` report that compression, otherwise as a plain file.

    Returns
    -------
    Dandelion object.
    """
    # NOTE(security): unpickling can execute arbitrary code; only load files
    # that come from a trusted source.
    if isBZIP(filename):
        opener = bz2.BZ2File
    elif isGZIP(filename):
        opener = gzip.open
    else:
        opener = open
    # A single `with` guarantees the handle is closed for every branch,
    # including the compressed ones, even if unpickling fails.
    with opener(filename, 'rb') as fh:
        return cPickle.load(fh)
def _graph_from_adjacency(adjacency: pd.DataFrame) -> nx.Graph:
    """Build a weighted graph from an adjacency table in which NaN marks a missing edge."""
    # Weights are shifted by +1 and NaN becomes 0 before the conversion, then
    # shifted back afterwards; presumably this keeps zero-weight edges from
    # being treated as "no edge" by networkx.
    shifted = (adjacency + 1).fillna(0)
    graph = nx.from_pandas_adjacency(shifted)
    for _, _, attrs in graph.edges(data=True):
        attrs['weight'] = attrs['weight'] - 1
    return graph


def read_h5(filename: str = 'dandelion_data.h5') -> Dandelion:
    """
    Reads in and returns a `Dandelion` class from .h5 format.

    Only the `data` table is mandatory. `metadata`, `edges`, `graph` and
    `layout` are passed to `Dandelion` only when they can be read; `germline`
    and `distance` are always passed and are simply empty when absent.

    Parameters
    ----------
    filename : str
        path to `.h5` file

    Returns
    -------
    `Dandelion` object. Its `threshold` attribute is set to the stored value,
    or to None when no threshold can be read.

    Raises
    ------
    AttributeError
        if the `data` table cannot be read from `filename`.
    """
    try:
        data = pd.read_hdf(filename, 'data')
    except Exception as err:
        raise AttributeError(
            '{} does not contain attribute `data`'.format(filename)) from err

    # Optional slots: None means "could not be read" and the slot is then left
    # out of the constructor call. Loading is best-effort, so failures are
    # ignored, but only `Exception` is caught so that KeyboardInterrupt and
    # SystemExit are not swallowed.
    metadata = edges = graph = layout = None

    try:
        metadata = pd.read_hdf(filename, 'metadata')
    except Exception:
        pass

    try:
        edges = pd.read_hdf(filename, 'edges')
    except Exception:
        pass

    try:
        g_0 = pd.read_hdf(filename, 'graph/graph_0')
        g_1 = pd.read_hdf(filename, 'graph/graph_1')
        graph = (_graph_from_adjacency(g_0), _graph_from_adjacency(g_1))
    except Exception:
        pass

    with h5py.File(filename, 'r') as hf:
        try:
            layouts = []
            for group in ('layout/layout_0', 'layout/layout_1'):
                attrs = hf[group].attrs
                layouts.append({k: np.array(attrs[k]) for k in attrs.keys()})
            # Assigned only once both layouts were read successfully.
            layout = tuple(layouts)
        except Exception:
            pass

        germline = {}
        try:
            for g in hf['germline'].attrs:
                germline[g] = hf['germline'].attrs[g]
        except Exception:
            pass

        distance = Tree()
        try:
            for d in hf['distance'].keys():
                d_ = pd.read_hdf(filename, 'distance/' + d)
                distance[d] = scipy.sparse.csr_matrix(d_.values)
        except Exception:
            pass

        try:
            # Builtin float: the `np.float` alias no longer exists in current
            # NumPy releases, which made this lookup fail unconditionally.
            threshold = float(np.array(hf['threshold']))
        except Exception:
            threshold = None

    constructor = {'data': data, 'germline': germline, 'distance': distance}
    optional = (('metadata', metadata), ('edges', edges),
                ('layout', layout), ('graph', graph))
    for key, value in optional:
        if value is not None:
            constructor[key] = value

    try:
        res = Dandelion(**constructor)
    except Exception:
        # Fall back to constructing without initialisation when the regular
        # construction fails.
        res = Dandelion(**constructor, initialize=False)

    res.threshold = threshold
    return res
def read_10x_airr(file: str) -> Dandelion:
    """
    Reads the 10x AIRR rearrangement .tsv directly and returns a `Dandelion` object.

    When the table has no `locus` column, one is derived from the `v_call`,
    `d_call`, `j_call` and `c_call` columns: a row is assigned `IGH`, `IGK`
    or `IGL` when every non-missing call contains that name, otherwise NaN.

    Parameters
    ----------
    file : str
        path to `airr_rearrangement.tsv`

    Returns
    -------
    `Dandelion` object of pandas data frame.
    """
    dat = load_data(file)
    if 'locus' not in dat:
        loci = ('IGH', 'IGK', 'IGL')
        locus = []
        # get all the v,d,j,c calls
        for calls in zip(dat['v_call'], dat['d_call'], dat['j_call'], dat['c_call']):
            present = [x for x in calls if pd.notnull(x)]
            if present:
                match = next(
                    (name for name in loci if all(name in x for x in present)),
                    np.nan)
            else:
                # `all()` over no calls is vacuously true, so a contig with
                # no calls at all must be handled explicitly, or it would be
                # labelled with the first locus tested.
                match = np.nan
            locus.append(match)
        dat['locus'] = locus
    return Dandelion(dat)
def to_scirpy(data: Dandelion, transfer: bool = False) -> AnnData:
    """
    Converts a `Dandelion` object to scirpy's format.

    If `data.data` has a `umi_count` column but no `duplicate_count` column,
    `duplicate_count` is added to `data.data` (on the object passed in) as a
    copy of `umi_count` before the conversion.

    Parameters
    ----------
    data : Dandelion
        `Dandelion` object
    transfer : bool
        Whether to execute :func:`dandelion.tl.transfer` to transfer all data
        to the :class:`anndata.AnnData` instance.

    Returns
    -------
    `AnnData` object in the format initialized by `scirpy`.

    Raises
    ------
    ImportError
        if `scirpy` is not installed.
    """
    try:
        import scirpy as ir
    except ImportError as err:
        # Only a failed import is translated; the original cause stays chained.
        raise ImportError('Please install scirpy. pip install scirpy') from err

    if 'duplicate_count' not in data.data and 'umi_count' in data.data:
        data.data['duplicate_count'] = data.data['umi_count']
    return ir.io.from_dandelion(data, transfer)
def from_scirpy(adata: AnnData,
                clone_key: Union[None, str] = None,
                key_added: Union[None, str] = None,
                mapping_mode: Literal['chain', 'cell'] = 'chain') -> Dandelion:
    """
    Reads a `scirpy` initialized `AnnData` object and returns a `Dandelion` object.

    Parameters
    ----------
    adata : AnnData
        `scirpy` initialized `AnnData` object.
    clone_key : str, optional
        column name for `clone_id` in `AnnData`. None defaults to `clonotype` in `scirpy` initialized object.
    key_added : str, optional
        column name for `clone_id` in `Dandelion`. None defaults to `clone_id` in `dandelion` initialized object.
    mapping_mode : str
        mode for retrieving the clone_id calls, either based on cells (all chains/contigs have the same call) or chains (allow for different calls between chains).
        Any other value leaves the clone column unassigned.

    Returns
    -------
    `Dandelion` object.

    Raises
    ------
    ImportError
        if `scirpy` is not installed.
    """
    try:
        import scirpy as ir
    except ImportError as err:
        raise ImportError('Please install scirpy. pip install scirpy') from err

    clonekey_s = 'clonotype' if clone_key is None else clone_key
    clonekey_d = 'clone_id' if key_added is None else key_added

    airr_cells = ir.io.to_ir_objs(adata)
    tmp_ = ir.io.to_dandelion(adata)
    tmp = tmp_.data.copy()

    # Per-cell clone id: prefer the dandelion-style column, then the scirpy
    # one, and finally fall back to joining the chains' own clone ids.
    if clonekey_d in adata.obs:
        cell_clonotype_dict = dict(zip(adata.obs.index, adata.obs[clonekey_d]))
    elif clonekey_s in adata.obs:
        cell_clonotype_dict = dict(zip(adata.obs.index, adata.obs[clonekey_s]))
    else:
        cell_clonotype_dict = {
            c.cell_id: '|'.join(
                cx['clone_id'] if isinstance(cx['clone_id'], str) else ''
                for cx in c.chains)
            for c in airr_cells
        }

    if mapping_mode == 'cell':
        tmp[clonekey_d] = [cell_clonotype_dict[x] for x in tmp['cell_id']]
    elif mapping_mode == 'chain':
        clone_dict = {
            cx['sequence_id']: cx['clone_id']
            for c in airr_cells for cx in c.chains
        }
        # When no chain carries a usable clone id, use the cell-level call.
        # Each value is tested for "empty string or null" individually so a
        # mixture of the two still triggers the fallback.
        if all(pd.isnull(v) or v == '' for v in clone_dict.values()):
            clone_dict = {
                cx['sequence_id']: cell_clonotype_dict[c.cell_id]
                for c in airr_cells for cx in c.chains
            }
        tmp[clonekey_d] = [clone_dict[x] for x in tmp['sequence_id']]

    # Re-run the constructor on the existing object with the updated table.
    tmp_.__init__(data=tmp)
    return tmp_