#!/usr/bin/env python
# @Author: Kelvin
# @Date: 2021-02-11 12:22:40
# @Last Modified by: Kelvin
# @Last Modified time: 2021-03-30 15:30:49
import os
from collections import defaultdict
import pandas as pd
import numpy as np
import re
import copy
from changeo.IO import readGermlines
import warnings
import h5py
import networkx as nx
import bz2
import gzip
from anndata import AnnData
import _pickle as cPickle
try:
    from scanpy import logging as logg
except ImportError:
    pass  # scanpy is optional; calls to `logg` below assume it is installed
from ..utilities._utilities import *
from ..utilities._io import *
from typing import Union, Sequence, Tuple, Dict
try:
    from typing import Literal
except ImportError:  # Python < 3.8
    from typing_extensions import Literal
class Dandelion:
"""
    `Dandelion` class object.
    Main class object storing input/output slots for all functions.
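
    Examples
    --------
    A minimal usage sketch, assuming the package is imported as `ddl`;
    the input file path is hypothetical:

    >>> import dandelion as ddl
    >>> vdj = ddl.Dandelion('filtered_contig_dandelion.tsv')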
"""
    def __init__(self, data=None, metadata=None, germline=None, distance=None, edges=None, layout=None, graph=None, initialize=True, **kwargs):
self.data = data
self.metadata = metadata
self.distance = distance
self.edges = edges
self.layout = layout
self.graph = graph
self.threshold = None
self.germline = {}
if germline is not None:
self.germline.update(germline)
if os.path.isfile(str(self.data)):
self.data = load_data(self.data)
if self.data is not None:
self.n_contigs = self.data.shape[0]
if metadata is None:
if initialize is True:
update_metadata(self, **kwargs)
                try:
                    self.n_obs = self.metadata.shape[0]
                except AttributeError:
                    self.n_obs = 0
else:
self.metadata = metadata
self.n_obs = self.metadata.shape[0]
else:
self.n_contigs = 0
self.n_obs = 0
def _gen_repr(self, n_obs, n_contigs) -> str:
        # inspired by AnnData's __repr__
descr = f"Dandelion class object with n_obs = {n_obs} and n_contigs = {n_contigs}"
for attr in ["data", "metadata", "distance", "edges"]:
            try:
                keys = getattr(self, attr).keys()
            except AttributeError:
                keys = []
if len(keys) > 0:
descr += f"\n {attr}: {str(list(keys))[1:-1]}"
else:
descr += f"\n {attr}: {str(None)}"
if self.layout is not None:
descr += f"\n layout: {', '.join(['layout for '+ str(len(x)) + ' vertices' for x in (self.layout[0], self.layout[1])])}"
else:
descr += f"\n layout: {str(None)}"
if self.graph is not None:
descr += f"\n graph: {', '.join(['networkx graph of '+ str(len(x)) + ' vertices' for x in (self.graph[0], self.graph[1])])} "
else:
descr += f"\n graph: {str(None)}"
return descr
def __repr__(self) -> str:
        # inspired by AnnData's __repr__
return self._gen_repr(self.n_obs, self.n_contigs)
    def copy(self):
"""
Performs a deep copy of all slots in `Dandelion` class.
Parameters
----------
self : Dandelion
`Dandelion` object.
        Returns
        -------
        A deep copy of the `Dandelion` object.
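
        Examples
        --------
        A minimal sketch, assuming `vdj` is an existing `Dandelion` object:

        >>> vdj2 = vdj.copy()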
"""
return copy.deepcopy(self)
    def update_germline(self, corrected: Union[None, Dict, str] = None, germline: Union[None, str] = None, org: Literal['human', 'mouse'] = 'human'):
"""
Update germline reference with corrected sequences and store in `Dandelion` object.
Parameters
----------
self : Dandelion
`Dandelion` object.
corrected : dict, str, optional
dictionary of corrected germline sequences or file path to corrected germline sequences fasta file.
germline : str, optional
path to germline database folder. Defaults to `$GERMLINE` environmental variable.
org : str
organism of reference folder. Default is 'human'.
Returns
-------
        Updated germline reference dictionary in the `.germline` slot.
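
        Examples
        --------
        A minimal sketch, assuming `vdj` is a `Dandelion` object and the
        `$GERMLINE` environment variable is set; the corrected fasta path
        below is hypothetical:

        >>> vdj.update_germline(corrected='corrected_germline.fasta', org='human')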
"""
start = logg.info('Updating germline reference')
env = os.environ.copy()
if germline is None:
            try:
                gml = env['GERMLINE']
            except KeyError:
                raise OSError(
                    'Environmental variable GERMLINE must be set. Otherwise, please provide path to folder containing germline IGHV, IGHD, and IGHJ fasta files.')
gml = gml+'imgt/'+org+'/vdj/'
        else:
            input_err_msg = ('Input for germline is incorrect. Please provide path to folder containing germline IGHV, IGHD, and IGHJ fasta files, or individual paths to the germline IGHV, IGHD, and IGHJ fasta files (with .fasta extension) as a list.')
            if isinstance(germline, list):
                if len(germline) < 3:
                    raise OSError(input_err_msg)
                gml = []
                for x in germline:
                    if not x.endswith('.fasta'):
                        raise OSError(input_err_msg)
                    gml.append(x)
            elif os.path.isdir(germline):
                gml = germline
            else:
                # a single path that is neither a folder nor a list of at least
                # three fasta files cannot supply the IGHV, IGHD and IGHJ references
                raise OSError(input_err_msg)
        if not isinstance(gml, list):
            gml = [gml]
germline_ref = readGermlines(gml)
if corrected is not None:
            if isinstance(corrected, dict):
                personalized_ref_dict = corrected
            elif os.path.isfile(str(corrected)):
                personalized_ref_dict = readGermlines([corrected])
            else:
                personalized_ref_dict = None
            if personalized_ref_dict is not None:
                # update with the personalized germline database
                germline_ref.update(personalized_ref_dict)
            else:
                raise OSError(
                    'Input for corrected germline fasta is incorrect. Please provide path to file containing corrected germline fasta sequences.')
self.germline.update(germline_ref)
logg.info(' finished', time=start,
deep=('Updated Dandelion object: \n'
' \'germline\', updated germline reference\n'))
    def write_pkl(self, filename: str = 'dandelion_data.pkl.pbz2', **kwargs):
"""
        Writes a `Dandelion` object to `.pkl` format.
Parameters
----------
filename
path to `.pkl` file.
**kwargs
passed to `_pickle`.
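
        Examples
        --------
        A minimal sketch (the output path is hypothetical):

        >>> vdj.write_pkl('dandelion_results.pkl.pbz2')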
"""
        if isBZIP(filename):
            try:
                with bz2.BZ2File(filename, 'wb') as f:
                    cPickle.dump(self, f, **kwargs)
            except Exception:  # retry with a pickle protocol that supports large objects
                with bz2.BZ2File(filename, 'wb') as f:
                    cPickle.dump(self, f, protocol=4, **kwargs)
        elif isGZIP(filename):
            try:
                with gzip.open(filename, 'wb') as f:
                    cPickle.dump(self, f, **kwargs)
            except Exception:  # retry with a pickle protocol that supports large objects
                with gzip.open(filename, 'wb') as f:
                    cPickle.dump(self, f, protocol=4, **kwargs)
        else:
            with open(filename, 'wb') as f:
                cPickle.dump(self, f, **kwargs)
    def write_h5(self, filename: str = 'dandelion_data.h5',
                 complib: Union[None, Literal['zlib', 'lzo', 'bzip2', 'blosc', 'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy', 'blosc:zlib', 'blosc:zstd']] = None,
                 compression: Union[None, Literal['zlib', 'lzo', 'bzip2', 'blosc', 'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy', 'blosc:zlib', 'blosc:zstd']] = None,
                 compression_level: Union[None, int] = None, **kwargs):
"""
        Writes a `Dandelion` object to `.h5` format.
Parameters
----------
filename
path to `.h5` file.
        complib : str, optional
            method of compression for data frames. See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_hdf.html for more options.
        compression : str, optional
            same call as complib. Just a convenience option.
        compression_level : {0-9}, optional
            specifies a compression level for data. A value of 0 disables compression.
**kwargs
passed to `pd.DataFrame.to_hdf`.
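
        Examples
        --------
        A minimal sketch (the output path is hypothetical):

        >>> vdj.write_h5('dandelion_results.h5', complib='bzip2')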
"""
        if compression_level is None:
            compression_level = 9
        # overwrite any existing file: opening in 'w' mode truncates it
        with h5py.File(filename, "w"):
            pass
        if complib is not None and compression is not None:
            raise ValueError(
                'Please specify only complib or compression. They do the same thing.')
        comp = complib if complib is not None else compression
        # now to actually saving
        # to_hdf cannot serialise columns of mixed types; where a column's
        # types are inconsistent, replace NaN with empty strings first
        for col in self.data.columns:
            weird = (self.data[[col]].applymap(type) !=
                     self.data[[col]].iloc[0].apply(type)).any(axis=1)
            if len(self.data[weird]) > 0:
                self.data[col] = self.data[col].where(
                    pd.notnull(self.data[col]), '')
self.data.to_hdf(filename, "data", complib=comp,
complevel=compression_level, **kwargs)
if self.metadata is not None:
for col in self.metadata.columns:
weird = (self.metadata[[col]].applymap(
type) != self.metadata[[col]].iloc[0].apply(type)).any(axis=1)
if len(self.metadata[weird]) > 0:
self.metadata[col] = self.metadata[col].where(
pd.notnull(self.metadata[col]), '')
self.metadata.to_hdf(filename, "metadata", complib=comp,
complevel=compression_level, format='table', nan_rep=np.nan, **kwargs)
        if self.edges is not None:
            if 'index' in self.edges.columns:
                self.edges.drop('index', axis=1, inplace=True)
            self.edges.to_hdf(filename, "edges", complib=comp,
                              complevel=compression_level, **kwargs)
        if self.graph is not None:
            for graph_counter, g in enumerate(self.graph):
                G = nx.to_pandas_adjacency(g, nonedge=np.nan)
                G.to_hdf(filename, "graph/graph_" + str(graph_counter),
                         complib=comp, complevel=compression_level, **kwargs)
        if self.distance is not None:
            for d in self.distance:
                # dense conversion of the sparse distance matrix; may be slow for large data
                dat = pd.DataFrame(self.distance[d].toarray())
                dat.to_hdf(filename, "distance/" + d, complib=comp,
                           complevel=compression_level, **kwargs)
with h5py.File(filename, "a") as hf:
            if self.layout is not None:
                for layout_counter, l in enumerate(self.layout):
                    group = hf.require_group('layout/layout_' + str(layout_counter))
                    for k in l.keys():
                        group.attrs[k] = l[k]
            if len(self.germline) > 0:
                grp = hf.require_group('germline')
                for k in self.germline.keys():
                    grp.attrs[k] = self.germline[k]
if self.threshold is not None:
tr = self.threshold
hf.create_dataset('threshold', data=tr)
def concat(arrays: Sequence[Union[pd.DataFrame, Dandelion]], check_unique: bool = True) -> Dandelion:
"""
    Concatenate `Dandelion` objects or pandas dataframes and return as a single `Dandelion` object.
Parameters
----------
arrays : Sequence
        List of `Dandelion` objects or pandas dataframes.
check_unique : bool
Check the new index for duplicates. Otherwise defer the check until necessary. Setting to False will improve the performance of this method.
Returns
-------
`Dandelion` object
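
    Examples
    --------
    A minimal sketch, assuming `vdj1` and `vdj2` are `Dandelion` objects:

    >>> merged = concat([vdj1, vdj2])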
"""
arrays = list(arrays)
    try:
        arrays_ = [x.data.copy() for x in arrays]
    except AttributeError:  # plain dataframes rather than Dandelion objects
        arrays_ = [x.copy() for x in arrays]
if check_unique:
        try:
            df = pd.concat(arrays_, verify_integrity=True)
        except ValueError:
            # duplicated sequence_ids across arrays; disambiguate with a per-array suffix
            for i in range(0, len(arrays)):
                arrays_[i]['sequence_id'] = [x + '__' +
                                             str(i) for x in arrays_[i]['sequence_id']]
            arrays_ = [load_data(x) for x in arrays_]
            df = pd.concat(arrays_, verify_integrity=True)
else:
df = pd.concat(arrays_)
    try:
        out = Dandelion(df)
    except Exception:  # fall back to an uninitialized object if metadata fails
        out = Dandelion(df, initialize=False)
    return out
def retrieve_metadata(data: pd.DataFrame, query: str, split: bool = True, collapse: bool = True,
                      combine: bool = False, locus: Literal['ig'] = 'ig', split_locus: bool = False,
                      verbose: bool = False) -> pd.DataFrame:
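    """
    Helper used by `initialize_metadata`: pivots the per-contig `query` column
    in `data` into a per-cell table, split by heavy/light chain (or by each
    locus when `split_locus` is True), optionally collapsing multiple contigs
    per cell into `|`-separated strings.
    """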
data_tmp = data.copy()
dat_dict = defaultdict(dict)
dict_ = defaultdict(dict)
metadata_ = defaultdict(dict)
locus_dict1 = {'ig': 'IGH'}
locus_dict2 = {'ig': ['IGK', 'IGL']}
locus_dict3 = {'ig': 'H'}
locus_dict4 = {'ig': 'L'}
query_dict = dict(zip(data_tmp['sequence_id'], data_tmp[query]))
if type_check(data, query):
data_tmp[query].fillna('unassigned', inplace=True)
typesoflocus = len(list(set(data_tmp['locus'])))
if typesoflocus > 1:
if split_locus:
for loci in flatten([locus_dict1[locus]] + locus_dict2[locus]):
tmp = data_tmp[data_tmp['locus'].isin([loci])].copy()
if tmp.shape[0] > 0:
dat_dict[loci] = tmp.copy()
else:
tmp3 = data_tmp[data_tmp['locus'].isin(
[locus_dict1[locus]])].copy()
tmp4 = data_tmp[data_tmp['locus'].isin(locus_dict2[locus])].copy()
if tmp3.shape[0] > 0:
dat_dict[locus_dict3[locus]] = tmp3.copy()
if tmp4.shape[0] > 0:
dat_dict[locus_dict4[locus]] = tmp4.copy()
else:
if verbose:
warnings.warn(UserWarning(
'Single locus type detected. Ignoring split = True and split_locus = True.'))
dat_dict[locus_dict3[locus]] = data_tmp[data_tmp['locus'].isin(
[locus_dict1[locus]])].copy()
for d in dat_dict:
tmp = Tree()
for cell, seq in zip(dat_dict[d]['cell_id'], dat_dict[d]['sequence_id']):
tmp[cell][seq].value = 1
cells = []
seqs = []
for t in tmp:
cells.append(t)
seqs.append([s for s in tmp[t]])
metadata_[d] = pd.DataFrame(seqs, index=cells)
metadata_[d][pd.isnull(metadata_[d])] = np.nan
if split_locus:
H = locus_dict1[locus]
else:
H = locus_dict3[locus]
if len(metadata_[H].columns) > 1:
metadata_[H].columns = ['sequence_id_'+H+'_' +
str(x) for x in range(0, len(metadata_[H].columns))]
else:
metadata_[H].columns = ['sequence_id_'+H+'_0']
if typesoflocus > 1:
if split_locus:
for L in locus_dict2[locus]:
if len(metadata_[L].columns) > 1:
metadata_[L].columns = ['sequence_id_'+L+'_' +
str(x) for x in range(0, len(metadata_[L].columns))]
else:
metadata_[L].columns = ['sequence_id_'+L+'_0']
else:
L = locus_dict4[locus]
if len(metadata_[L].columns) > 1:
metadata_[L].columns = ['sequence_id_'+L+'_' +
str(x) for x in range(0, len(metadata_[L].columns))]
else:
metadata_[L].columns = ['sequence_id_'+L+'_0']
metadata_result = metadata_.copy()
for l in metadata_:
for x in metadata_[l]:
metadata_result[l][x] = [query_dict[i] if i ==
i else np.nan for i in metadata_[l][x]]
if typesoflocus > 1:
if not split_locus:
results = retrieve_result_dict(
query, data_tmp, metadata_result[locus_dict3[locus]], metadata_result[locus_dict4[locus]], locus, split, collapse, combine)
else:
results = retrieve_result_dict(query, data_tmp, metadata_result[locus_dict1[locus]], [
metadata_result[L] for L in locus_dict2[locus]], locus, split, collapse, combine)
else:
results = retrieve_result_dict_singular(
query, data_tmp, metadata_result[locus_dict3[locus]], locus, collapse, combine)
    return results
def retrieve_result_dict(query: str, data: pd.DataFrame, meta_h: pd.DataFrame, meta_l: pd.DataFrame,
                         locus: Literal['ig'] = 'ig', split: bool = True, collapse: bool = True,
                         combine: bool = False, verbose: bool = False) -> pd.DataFrame:
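    """
    Helper used by `retrieve_metadata`: joins the per-cell heavy- and
    light-chain tables for `query` and reshapes them according to the
    `split`, `collapse` and `combine` flags.
    """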
df_hl = defaultdict(dict)
locus_dict1 = {'ig': 'IGH'}
locus_dict2 = {'ig': ['IGK', 'IGL']}
locus_dict3 = {'ig': 'H'}
locus_dict4 = {'ig': 'L'}
if len(meta_l) == 2 and type(meta_l) is list:
H = locus_dict1[locus]
else:
H = locus_dict3[locus]
if meta_h.shape[1] > 1:
if collapse:
newmeta_h = meta_h.copy()
if type_check(meta_h, 'sequence_id_'+H+'_0'):
newh = []
for i in meta_h.index:
try:
newh.append(
'|'.join([h for h in list(dict.fromkeys(newmeta_h.loc[i])) if h == h]))
except:
newh.append('|'.join([str(h) for h in list(
dict.fromkeys(newmeta_h.loc[i])) if h == h]))
newmeta_h['sequence_id_'+H+'_0'] = newh
meta_h = pd.DataFrame(newmeta_h['sequence_id_'+H+'_0'].copy())
else:
collapse = False
if verbose:
warnings.warn(UserWarning(
'Multiple heavy chain contigs mapping to the same cell barcode and/or query dtype is {}. Ignoring collapse = True.'.format(meta_h['sequence_id_'+H+'_0'].dtype.name)))
if len(meta_l) == 2 and type(meta_l) is list:
metadata_ = meta_h.join(meta_l[0]).join(meta_l[1])
else:
metadata_ = meta_h.join(meta_l)
df_ = metadata_.copy()
if type_check(meta_h, 'sequence_id_'+H+'_0'):
if split:
df_hl[H] = df_[list(meta_h.columns)].copy()
if len(meta_l) == 2 and type(meta_l) is list:
for x in range(0, len(locus_dict2[locus])):
L = locus_dict2[locus][x]
df_hl[L] = df_[list(meta_l[x].columns)].copy()
else:
L = locus_dict4[locus]
df_hl[L] = df_[list(meta_l.columns)].copy()
df_res_hl = df_hl.copy()
res_ = defaultdict(list)
result_dict_hl = defaultdict(dict)
if collapse:
for d in df_res_hl:
for i in df_hl[d].index:
if combine:
try:
res_[d].append(
'|'.join([l for l in list(dict.fromkeys(df_hl[d].loc[i])) if l == l]))
except:
res_[d].append('|'.join([str(l) for l in list(
dict.fromkeys(df_hl[d].loc[i])) if l == l]))
else:
try:
res_[d].append(
'|'.join([l for l in list(df_hl[d].loc[i]) if l == l]))
except:
res_[d].append(
'|'.join([str(l) for l in list(df_hl[d].loc[i]) if l == l]))
df_res_hl[d][query] = res_[d]
result_dict_hl[d] = dict(df_res_hl[d][query])
for k, v in result_dict_hl[d].items():
if type(v) is not list:
result_dict_hl[d][k] = [v]
            if len(meta_l) == 2 and type(meta_l) is list:
result_dict_ = {query+'_'+H: result_dict_hl[H]}
for x in range(0, len(locus_dict2[locus])):
L = locus_dict2[locus][x]
result_dict_.update({query+'_'+L: result_dict_hl[L]})
else:
result_dict_ = {
query+'_heavy': result_dict_hl[H], query+'_light': result_dict_hl[L]}
else:
for d in df_res_hl:
result_dict_hl[d] = df_res_hl[d]
else:
df_res = df_.copy()
q_res = []
if collapse:
for i in metadata_.index:
if combine:
q_res.append(
'|'.join([qq for qq in list(dict.fromkeys(df_.loc[i])) if qq == qq]))
else:
q_res.append(
'|'.join([qq for qq in list(df_.loc[i]) if qq == qq]))
else:
for i in metadata_.index:
q_res.append([qq for qq in list(df_.loc[i]) if qq == qq])
df_res[query] = q_res
result_dict_ = dict(df_res[query])
else:
result_dict_ = {x: dict(df_[x]) for x in df_}
rs_dict1 = {'ig': '[HL]'}
rs_dict2 = {'ig': 'IG[HKL]'}
if len(meta_l) == 2 and type(meta_l) is list:
rs = '_sequence_id_'+rs_dict2[locus]
else:
rs = '_sequence_id_'+rs_dict1[locus]
final_result_ = defaultdict(dict)
if split:
if not collapse:
if len(meta_l) == 2 and type(meta_l) is list:
names = [k for k in result_dict_hl.keys()]
final_result = result_dict_hl[names[0]].join(
result_dict_hl[names[1]]).join(result_dict_hl[names[2]])
final_result.columns = [re.sub('_sequence_id', '', q) for q in [
query+'_'+str(l) for l in final_result.columns]]
else:
try:
if type_check(meta_h, 'sequence_id_'+H+'_0'):
final_result_h = pd.DataFrame(result_dict_hl[H])
else:
final_result_h = pd.DataFrame(result_dict_[H])
except:
if type_check(meta_h, 'sequence_id_'+H+'_0'):
final_result_h = pd.DataFrame.from_dict(
result_dict_hl, orient='index').T
final_result_h = final_result_h[meta_h.columns].copy()
else:
final_result_h = pd.DataFrame.from_dict(
result_dict_, orient='index').T
final_result_h = final_result_h[meta_h.columns].copy()
final_result_h.columns = [re.sub(rs, '', q) for q in [
query+'_heavy_'+str(l) for l in final_result_h.columns]]
try:
if type_check(meta_h, 'sequence_id_'+H+'_0'):
final_result_l = pd.DataFrame(
result_dict_hl[locus_dict4[locus]])
else:
final_result_l = pd.DataFrame(
result_dict_[locus_dict4[locus]])
except:
if type_check(meta_h, 'sequence_id_'+H+'_0'):
final_result_l = pd.DataFrame.from_dict(
result_dict_hl, orient='index').T
final_result_l = final_result_l[meta_l.columns].copy()
else:
final_result_l = pd.DataFrame.from_dict(
result_dict_, orient='index').T
final_result_l = final_result_l[meta_l.columns].copy()
final_result_l.columns = [re.sub(rs, '', q) for q in [
query+'_light_'+str(l) for l in final_result_l.columns]]
final_result = final_result_h.join(final_result_l)
else:
if len(meta_l) == 2 and type(meta_l) is list:
if type_check(meta_h, 'sequence_id_'+H+'_0'):
for d in result_dict_:
final_result_[d] = pd.DataFrame.from_dict(
result_dict_[d], orient='index')
final_result_[d].columns = [re.sub(rs, '', q) for q in [
d+'_'+str(l) for l in final_result_[d].columns]]
final_result_[d].columns = [
re.sub('_[0-9]', '', q) for q in final_result_[d].columns]
names = [k for k in final_result_.keys()]
final_result = final_result_[names[0]].join(
final_result_[names[1]]).join(final_result_[names[2]])
else:
if verbose:
warnings.warn(UserWarning('Query dtype is {}. Ignoring collapse = True.'.format(
meta_h['sequence_id_'+H+'_0'].dtype.name)))
final_result = pd.DataFrame.from_dict(
result_dict_, orient='index').T
final_result.columns = [
query+re.sub('sequence_id', '', q) for q in final_result.columns]
else:
if type_check(meta_h, 'sequence_id_'+H+'_0'):
final_result_h = pd.DataFrame.from_dict(
result_dict_[query+'_heavy'], orient='index')
final_result_h.columns = [query+'_heavy']
final_result_l = pd.DataFrame.from_dict(
result_dict_[query+'_light'], orient='index')
final_result_l.columns = [query+'_light']
final_result = final_result_h.join(final_result_l)
else:
if verbose:
warnings.warn(UserWarning('Query dtype is {}. Ignoring collapse = True.'.format(
meta_h['sequence_id_'+H+'_0'].dtype.name)))
final_result_h = pd.DataFrame.from_dict(
result_dict_, orient='index').T
final_result_h = final_result_h[meta_h.columns].copy()
final_result_h.columns = [re.sub(rs, '', q) for q in [
query+'_heavy_'+str(l) for l in final_result_h.columns]]
final_result_l = pd.DataFrame.from_dict(
result_dict_, orient='index').T
final_result_l = final_result_l[meta_l.columns].copy()
final_result_l.columns = [re.sub(rs, '', q) for q in [
query+'_light_'+str(l) for l in final_result_l.columns]]
final_result = final_result_h.join(final_result_l)
else:
if type_check(meta_h, 'sequence_id_'+H+'_0'):
if not collapse:
if len(meta_l) == 2 and type(meta_l) is list:
final_result = pd.DataFrame.from_dict(
result_dict_, orient='index')
final_result.columns = [re.sub(rs, '', q) for q in [
query+'_'+str(l) for l in final_result.columns]]
else:
final_result = pd.DataFrame.from_dict(
result_dict_, orient='index')
final_result.columns = [re.sub(rs, '', q) for q in [
query+'_'+str(l) for l in final_result.columns]]
else:
if len(meta_l) == 2 and type(meta_l) is list:
final_result = pd.DataFrame.from_dict(
result_dict_, orient='index')
final_result.columns = [query]
else:
final_result = pd.DataFrame.from_dict(
result_dict_, orient='index')
final_result.columns = [query]
else:
if not collapse:
if len(meta_l) == 2 and type(meta_l) is list:
final_result = pd.DataFrame.from_dict(
result_dict_, orient='index').T
final_result.columns = [re.sub('_sequence_id', '', q) for q in [
query+'_'+str(l) for l in final_result.columns]]
else:
final_result = pd.DataFrame.from_dict(
result_dict_, orient='index').T
final_result.columns = [re.sub(rs, '', q) for q in [
query+'_'+str(l) for l in final_result.columns]]
else:
if verbose:
warnings.warn(UserWarning('Query dtype is {}. Ignoring collapse = True and split = False.'.format(
meta_h['sequence_id_'+H+'_0'].dtype.name)))
if len(meta_l) == 2 and type(meta_l) is list:
final_result = pd.DataFrame.from_dict(
result_dict_, orient='index').T
final_result.columns = [re.sub('_sequence_id', '', q) for q in [
query+'_'+str(l) for l in final_result.columns]]
else:
typedict = {locus_dict3[locus]: 'heavy',
locus_dict4[locus]: 'light'}
final_result = pd.DataFrame.from_dict(
result_dict_, orient='index').T
final_result.columns = [re.sub(rs, '', q) for q in [
query+'_'+typedict[l.split('_')[2]]+'_'+l for l in final_result.columns]]
    return final_result
def retrieve_result_dict_singular(query: str, data: pd.DataFrame, meta_h: pd.DataFrame,
                                  locus: Literal['ig'] = 'ig', collapse: bool = True,
                                  combine: bool = False, verbose: bool = False) -> pd.DataFrame:
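    """
    Variant of `retrieve_result_dict` for data containing a single locus type
    (heavy chain only).
    """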
df_hl = defaultdict(dict)
locus_dict1 = {'ig': 'IGH'}
locus_dict3 = {'ig': 'H'}
H = locus_dict3[locus]
if meta_h.shape[1] > 1:
if collapse:
newmeta_h = meta_h.copy()
if type_check(meta_h, 'sequence_id_'+H+'_0'):
newh = []
for i in meta_h.index:
try:
newh.append(
'|'.join([h for h in list(dict.fromkeys(newmeta_h.loc[i])) if h == h]))
except:
newh.append('|'.join([str(h) for h in list(
dict.fromkeys(newmeta_h.loc[i])) if h == h]))
newmeta_h['sequence_id_'+H+'_0'] = newh
meta_h = pd.DataFrame(newmeta_h['sequence_id_'+H+'_0'].copy())
else:
collapse = False
if verbose:
warnings.warn(UserWarning(
'Multiple heavy chain contigs mapping to the same cell barcode and/or query dtype is {}. Ignoring collapse = True.'.format(meta_h['sequence_id_'+H+'_0'].dtype.name)))
metadata_ = meta_h.copy()
df_ = metadata_.copy()
if type_check(meta_h, 'sequence_id_'+H+'_0'):
df_res = df_.copy()
q_res = []
if collapse:
for i in metadata_.index:
if combine:
try:
q_res.append(
'|'.join([qq for qq in list(dict.fromkeys(df_.loc[i])) if qq == qq]))
except:
q_res.append('|'.join([str(qq) for qq in list(
dict.fromkeys(df_.loc[i])) if qq == qq]))
else:
try:
q_res.append(
'|'.join([qq for qq in list(df_.loc[i]) if qq == qq]))
except:
q_res.append(
'|'.join([str(qq) for qq in list(df_.loc[i]) if qq == qq]))
else:
for i in metadata_.index:
q_res.append([qq for qq in list(df_.loc[i]) if qq == qq])
df_res[query] = q_res
result_dict_ = dict(df_res[query])
else:
result_dict_ = {x: dict(df_[x]) for x in df_}
rs_dict1 = {'ig': '[H]'}
rs = '_sequence_id_'+rs_dict1[locus]
final_result_ = defaultdict(dict)
if type_check(meta_h, 'sequence_id_'+H+'_0'):
if not collapse:
final_result = pd.DataFrame.from_dict(result_dict_, orient='index')
final_result.columns = [re.sub(rs, '', q) for q in [
query+'_'+str(l) for l in final_result.columns]]
else:
final_result = pd.DataFrame.from_dict(result_dict_, orient='index')
final_result.columns = [query]
else:
if not collapse:
final_result = pd.DataFrame.from_dict(
result_dict_, orient='index').T
final_result.columns = [re.sub(rs, '', q) for q in [
query+'_'+str(l) for l in final_result.columns]]
else:
if verbose:
warnings.warn(UserWarning('Query dtype is {}. Ignoring collapse = True and split = False.'.format(
meta_h['sequence_id_'+H+'_0'].dtype.name)))
typedict = {locus_dict3[locus]: 'heavy'}
final_result = pd.DataFrame.from_dict(
result_dict_, orient='index').T
final_result.columns = [re.sub(rs, '', q) for q in [
query+'_'+typedict[l.split('_')[2]]+'_'+l for l in final_result.columns]]
    return final_result
def initialize_metadata(self, cols: Sequence, locus_: str, clonekey: str,
                        collapse_alleles: bool, verbose: bool):
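    """
    Helper that builds the per-cell `.metadata` table from `.data`: retrieves
    each column in `cols` per cell, then derives clone size, status,
    productivity, isotype and multi-chain (`vdj_status`) summaries before
    assigning the result to `self.metadata`.
    """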
init_dict = {}
for col in cols:
init_dict.update({col: {'split': True, 'collapse': True,
'combine': False, 'locus': locus_, 'split_locus': False}})
if clonekey in init_dict:
init_dict.update({clonekey: {'split': False, 'collapse': True,
'combine': True, 'locus': locus_, 'split_locus': False}})
if 'sample_id' in init_dict:
init_dict.update({'sample_id': {'split': False, 'collapse': True,
'combine': True, 'locus': locus_, 'split_locus': False}})
meta_ = defaultdict(dict)
for k, v in init_dict.copy().items():
if (all(pd.isnull(self.data[k]))) or (all([i == '' for i in self.data[k]])):
init_dict.pop(k)
continue
meta_[k] = retrieve_metadata(self.data, query=k, verbose=verbose, **v)
tmp_metadata = pd.concat(meta_.values(), axis=1, join="inner")
if 'locus_heavy' in tmp_metadata:
suffix_h = '_heavy'
suffix_l = '_light'
else:
suffix_h = ''
suffix_l = ''
if clonekey in init_dict:
tmp_metadata[str(clonekey)] = tmp_metadata[str(
clonekey)].replace('', 'unassigned')
clones = tmp_metadata[str(clonekey)].str.split('|', expand=False)
tmpclones = []
for i in clones:
while 'unassigned' in i:
i.remove('unassigned')
if len(i) == 1:
break
tmpclones.append(i)
tmpclones = ['|'.join(list(set(x))) for x in tmpclones]
tmpclonesdict = dict(zip(tmp_metadata.index, tmpclones))
tmp_metadata[str(clonekey)] = pd.Series(tmpclonesdict)
tmp = tmp_metadata[str(clonekey)].str.split('|', expand=True).stack()
tmp = tmp.reset_index(drop=False)
tmp.columns = ['cell_id', 'tmp', str(clonekey)]
clone_size = tmp[str(clonekey)].value_counts()
if "" in clone_size.index:
clone_size = clone_size.drop("", axis=0)
clonesize_dict = dict(clone_size)
size_of_clone = pd.DataFrame.from_dict(clonesize_dict, orient='index')
size_of_clone.reset_index(drop=False, inplace=True)
size_of_clone.columns = [str(clonekey), 'clone_size']
size_of_clone[str(clonekey)+'_by_size'] = size_of_clone.index+1
size_dict = dict(
zip(size_of_clone[clonekey], size_of_clone[str(clonekey)+'_by_size']))
size_dict.update({'': 'unassigned'})
tmp_metadata[str(clonekey)+'_by_size'] = ['|'.join(sorted(list(set([str(size_dict[c_]) for c_ in c.split('|')]))))
if len(c.split('|')) > 1 else str(size_dict[c]) for c in tmp_metadata[str(clonekey)]]
tmp_metadata[str(
clonekey)+'_by_size'] = tmp_metadata[str(clonekey)+'_by_size'].astype('category')
tmp_metadata = tmp_metadata[[str(clonekey), str(clonekey)+'_by_size'] + [
cl for cl in tmp_metadata if cl not in [str(clonekey), str(clonekey)+'_by_size']]]
for i in tmp_metadata.index:
if pd.notnull(tmp_metadata.loc[i, 'locus'+suffix_h]):
if pd.notnull(tmp_metadata.loc[i, 'locus'+suffix_l]):
if (tmp_metadata.loc[i, 'locus'+suffix_l] != ''):
tmp_metadata.at[i, 'status'] = tmp_metadata.loc[i,
'locus'+suffix_h]+' + ' + tmp_metadata.loc[i, 'locus'+suffix_l]
else:
tmp_metadata.at[i, 'status'] = tmp_metadata.loc[i,
'locus'+suffix_h] + '_only'
elif tmp_metadata.loc[i, 'locus'+suffix_h] != '':
tmp_metadata.at[i, 'status'] = tmp_metadata.loc[i,
'locus'+suffix_h] + '_only'
else:
tmp_metadata.at[i, 'status'] = 'unassigned'
else:
tmp_metadata.at[i, 'status'] = 'unassigned'
tmp_metadata['status_summary'] = [
'Multi' if '|' in i else i for i in tmp_metadata['status']]
    for i in tmp_metadata.index:
        # `x == x` is False only for NaN; use pd.notnull for clarity
        if pd.notnull(tmp_metadata.loc[i, 'productive'+suffix_h]):
if not pd.isnull(tmp_metadata.loc[i, 'productive'+suffix_l]):
if tmp_metadata.loc[i, 'productive'+suffix_l] != '':
tmp_metadata.at[i, 'productive'] = tmp_metadata.loc[i,
'productive'+suffix_h]+' + ' + tmp_metadata.loc[i, 'productive'+suffix_l]
else:
tmp_metadata.at[i, 'productive'] = tmp_metadata.loc[i,
'productive'+suffix_h]
elif tmp_metadata.loc[i, 'productive'+suffix_h] != '':
tmp_metadata.at[i, 'productive'] = tmp_metadata.loc[i,
'productive'+suffix_h]
else:
tmp_metadata.at[i, 'productive'] = 'unassigned'
else:
tmp_metadata.at[i, 'productive'] = 'unassigned'
tmp_metadata['productive_summary'] = [
'Multi' if '|' in i else i for i in tmp_metadata['productive']]
    conversion_dict = {
        'igha1': 'IgA', 'igha2': 'IgA', 'igha': 'IgA',
        'ighd': 'IgD', 'ighe': 'IgE', 'ighm': 'IgM',
        'ighg1': 'IgG', 'ighg2': 'IgG', 'ighg3': 'IgG', 'ighg4': 'IgG', 'ighg': 'IgG',
        'igkc': 'IgK',
        'iglc1': 'IgL', 'iglc2': 'IgL', 'iglc3': 'IgL', 'iglc4': 'IgL',
        'iglc5': 'IgL', 'iglc6': 'IgL', 'iglc7': 'IgL', 'iglc': 'IgL',
        'nan': 'unassigned', 'na': 'unassigned', 'none': 'unassigned',
        '': 'unassigned', 'unassigned': 'unassigned', np.nan: 'unassigned',
        None: 'unassigned'}
isotype = []
for k in tmp_metadata['c_call'+suffix_h]:
if isinstance(k, str):
if ',' in k:
k = '|'.join(k.split(','))
if '|' in k:
isotype.append('|'.join([str(z) for z in [conversion_dict[y.lower()] for y in set(
[re.sub('[0-9]', '', x) for x in k.split('|')])]]))
else:
isotype.append(conversion_dict[k.lower()])
else:
isotype.append('unassigned')
tmp_metadata['isotype'] = isotype
tmp_metadata['isotype_summary'] = [i if i == 'IgM|IgD' or i == 'IgD|IgM' else
'Multi' if '|' in i else i for i in tmp_metadata['isotype']]
vdj_gene_calls = ['v_call', 'd_call', 'j_call']
if collapse_alleles:
for x in vdj_gene_calls:
if x in self.data:
for c in tmp_metadata:
if x in c:
tmp_metadata[c] = ['|'.join(['|'.join(list(set(yy.split(',')))) for yy in list(set(
[re.sub('[*][0-9][0-9]', '', tx) for tx in t.split('|')]))]) for t in tmp_metadata[c]]
multi = {}
multic = {}
for i in tmp_metadata.index:
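        # contig-level calls may be NaN (float) rather than str, so `.split`
        # can fail; each try/except below falls back to the raw value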
try:
if 'v_call_genotyped' in cols:
hv_ = tmp_metadata.at[i,
'v_call_genotyped'+suffix_h].split('|')
else:
hv_ = tmp_metadata.at[i, 'v_call'+suffix_h].split('|')
except:
if 'v_call_genotyped' in cols:
hv_ = tmp_metadata.at[i, 'v_call_genotyped'+suffix_h]
else:
hv_ = tmp_metadata.at[i, 'v_call'+suffix_h]
try:
hj_ = tmp_metadata.at[i, 'j_call'+suffix_h].split('|')
except:
hj_ = tmp_metadata.at[i, 'j_call'+suffix_h]
try:
if 'v_call_genotyped' in cols:
lv_ = tmp_metadata.at[i,
'v_call_genotyped'+suffix_l].split('|')
else:
lv_ = tmp_metadata.at[i, 'v_call'+suffix_l].split('|')
except:
if 'v_call_genotyped' in cols:
lv_ = tmp_metadata.at[i, 'v_call_genotyped'+suffix_l]
else:
lv_ = tmp_metadata.at[i, 'v_call'+suffix_l]
try:
lj_ = tmp_metadata.at[i, 'j_call'+suffix_l].split('|')
except:
lj_ = tmp_metadata.at[i, 'j_call'+suffix_l]
try:
hc_ = tmp_metadata.at[i, 'c_call'+suffix_h].split('|')
except:
hc_ = tmp_metadata.at[i, 'c_call'+suffix_h]
try:
lc_ = tmp_metadata.at[i, 'c_call'+suffix_l].split('|')
except:
lc_ = tmp_metadata.at[i, 'c_call'+suffix_l]
multi_h = []
multi_l = []
multi_hc = []
multi_lc = []
if len(hv_) > 1:
multi_h.append(['Multi'+suffix_h+'_v'])
if len(hj_) > 1:
multi_h.append(['Multi'+suffix_h+'_j'])
if len(lv_) > 1:
multi_l.append(['Multi'+suffix_l+'_v'])
if len(lj_) > 1:
multi_l.append(['Multi'+suffix_l+'_j'])
if len(hc_) > 1:
if (tmp_metadata.at[i, 'isotype_summary'] == 'IgM|IgD') or (tmp_metadata.at[i, 'isotype_summary'] == 'IgD|IgM'):
multi_hc.append([tmp_metadata.at[i, 'isotype_summary']])
else:
multi_hc.append(['Multi'+suffix_h+'_c'])
if len(lc_) > 1:
multi_lc.append(['Multi'+suffix_l+'_c'])
if len(multi_hc) < 1:
multi_hc.append(['Single'])
if len(multi_h) < 1:
multi_h.append(['Single'])
if (len(lv_) == 1) & (len(lj_) == 1):
if ('' not in lv_) and ('' not in lj_):
if len(multi_l) < 1:
multi_l.append(['Single'])
if len(lc_) == 1:
if ('' not in lc_):
if len(multi_lc) < 1:
multi_lc.append(['Single'])
multih = '|'.join(list(set(flatten(multi_h))))
multil = '|'.join(list(set(flatten(multi_l))))
multihc = '|'.join(list(set(flatten(multi_hc))))
multilc = '|'.join(list(set(flatten(multi_lc))))
if len(multih) > 0:
if len(multil) > 0:
multi[i] = multih + ' + ' + multil
else:
multi[i] = multih
else:
multi[i] = 'unassigned'
if len(multihc) > 0:
if len(multilc) > 0:
multic[i] = multihc + ' + ' + multilc
else:
multic[i] = multihc
else:
multic[i] = 'unassigned'
tmp_metadata['vdj_status'] = pd.Series(multi)
tmp_metadata['vdj_status_summary'] = [
'Multi' if 'Multi'+suffix_h in i else 'Single' for i in tmp_metadata['vdj_status']]
tmp_metadata['heavychain_status_summary'] = [
'Multi' if 'Multi'+suffix_h in i else 'Single' for i in pd.Series(multic)]
self.metadata = tmp_metadata.copy()