import pandas as pd
import numpy as np
from anndata import AnnData
def scanpy_to_pycroc(AnnData, gene_list=None, obs_name=None, case_class=None, case_label=None):
    """
    Convert a scanpy AnnData object into a pycroc data frame.

    The genes of ``gene_list`` that are present in ``AnnData.var_names`` are
    selected (in sorted order) and their expression values are copied into
    one column per gene, next to an ``ID`` column holding the observation
    names. When ``obs_name`` is given, a ``Class`` column is added as well,
    producing a labelled (training) dataset; otherwise the result is
    unlabelled.

    Parameters
    ----------
    AnnData: AnnData
        a scanpy AnnData.
    gene_list: list(str)
        a list of gene names. Names missing from ``AnnData.var_names`` are
        silently dropped. Defaults to no genes.
    obs_name: str
        name of the AnnData.obs column to be used for labelling. Leave it
        as None to obtain an unlabelled dataset.
    case_class: str or list(str)
        one or more categories of ``AnnData.obs[obs_name]`` to be set as
        case class. Required when ``obs_name`` is given.
    case_label: str
        label assigned to the observations belonging to the case class;
        every other observation is labelled ``'Others'``. Used only when
        ``obs_name`` is given.

    Returns
    -------
    data (pandas.DataFrame): one row per observation, with columns ``ID``,
    ``Class`` (labelled data only) and one column per selected gene.

    Raises
    ------
    TypeError
        if ``obs_name`` is given and ``case_class`` is neither a str nor a
        list.
    """
    if gene_list is None:
        gene_list = ()
    known_genes = AnnData.var_names
    genes = sorted(g for g in gene_list if g in known_genes)

    expression = AnnData[:, genes].X
    # .X may be a sparse matrix, which pandas cannot combine with column
    # labels directly; densify it first.
    if hasattr(expression, "toarray"):
        expression = expression.toarray()
    markers = pd.DataFrame(np.asarray(expression), columns=genes)

    # Positional (0..n-1) row index so that the concat below lines up with
    # `markers` row by row.
    ids = pd.DataFrame({"ID": AnnData.obs.index.to_numpy()})

    if obs_name is not None:
        if isinstance(case_class, str):
            case_categories = [case_class]
        elif isinstance(case_class, list):
            case_categories = case_class
        else:
            raise TypeError("case_class must be a str or a list(str)")
        is_case = AnnData.obs[obs_name].isin(case_categories).to_numpy()
        # Build the labels directly rather than replacing booleans in place
        # on a column selection: that chained in-place edit does not
        # propagate to the frame under pandas Copy-on-Write.
        ids["Class"] = [case_label if flag else "Others" for flag in is_case]

    return pd.concat([ids, markers], axis=1)