Source code for cobra.topology.reporter_metabolites

#cobra.topology.reporter_metabolites.py: Module for topological analysis of cobra_models
#Based on Patil et al 2005 PNAS 102:2685-9
#TODO: Validate cobra.core compliance
from copy import deepcopy
from numpy import array, corrcoef, mean, std, tril, where, unique, zeros
from scipy.stats import norm, randint
from collections import defaultdict

def identify_reporter_metabolites(cobra_model, reaction_scores_dict,
                                  number_of_randomizations=1000,
                                  number_of_layers=1,
                                  scoring_metric='default', score_type='p',
                                  entire_network=False,
                                  background_correction=True,
                                  ignore_external_boundary_reactions=False):
    """Calculate the aggregate Z-score for the metabolites in the model.
    Reactions that are solely spontaneous or orphan are ignored.  Allowing the
    scores to have multiple columns / experiments will change the way the
    output is represented.

    cobra_model: A cobra.Model object

    TODO: CHANGE TO USING DICTIONARIES for the_reactions: the_scores

    reaction_scores_dict: A dictionary where the keys are reactions in
    cobra_model.reactions and the values are the scores.  Currently, only a
    single numeric value is supported per reaction; this will be updated to
    allow for lists.

    number_of_randomizations: Integer.  Number of random shuffles of the
    scores used to assess which metabolites are significant.

    number_of_layers: 1 is the only option supported.

    scoring_metric: 'default' means divide by k**0.5.

    score_type: 'p' is the only option at the moment and indicates p-value.

    entire_network: Boolean.  Currently, only compares scores calculated from
    the_reactions.

    background_correction: Boolean.  If True, apply background correction to
    the aggregate Z-score.

    ignore_external_boundary_reactions: Not yet implemented.  Boolean.  If
    True, do not count exchange reactions when calculating the score.

    """
    #Add in a function to calculate based on correlation coefficients and to
    #deal with other multidimensional data.
    the_reactions = reaction_scores_dict.keys()
    the_scores = reaction_scores_dict.values()
    if score_type == 'p' and not hasattr(the_scores[0], '__iter__'):
        #minimum and maximum p-values are used to prevent numerical problems.
        #haven't decided whether an arbitrary min / max 1e-15 is preferred to
        #blunting the ends based on the values closest to 0 or 1.
        the_reactions = reaction_scores_dict.keys()
        the_scores = array(reaction_scores_dict.values())
        minimum_p = min(the_scores[the_scores.nonzero()[0]])
        maximum_p = max(the_scores[where(the_scores < 1)[0]])
        the_scores[where(the_scores < minimum_p)] = minimum_p
        the_scores[where(the_scores > maximum_p)] = maximum_p
        the_scores = -norm.ppf(the_scores)
        #update the dictionary with the new scores
        reaction_scores_dict = dict(zip(the_reactions, the_scores))
    elif hasattr(the_scores[0], '__iter__'):
        #In the case that the_scores is a list of lists, assume that each list
        #is the score for the corresponding reaction in the_reactions across
        #all experiments.  Then, for each metabolite, calculate
        #invnorm(|Pearson correlation coefficient|) for each reaction pair
        #that it links.
        raise Exception("This isn't implemented yet")
    #Get the connectivity for each metabolite
    the_metabolites = set()
    for the_reaction in reaction_scores_dict:
        the_metabolites.update(the_reaction._metabolites)
    metabolite_scores = {}
    metabolite_connections = {}
    #Calculate the score for each metabolite
    for the_metabolite in the_metabolites:
        nonspontaneous_connections = [x for x in the_metabolite._reaction
                                      if x.gene_reaction_rule.lower()
                                      not in ['s0001', '']]
        tmp_score = 0
        number_of_connections = len(nonspontaneous_connections)
        for the_reaction in nonspontaneous_connections:
            if the_reaction not in reaction_scores_dict:
                if not entire_network:
                    number_of_connections -= 1
                    continue
            else:
                tmp_score += reaction_scores_dict[the_reaction]
        metabolite_scores[the_metabolite] = tmp_score
        metabolite_connections[the_metabolite] = number_of_connections
    #NOTE: Doing the corrections based only on the significantly perturbed
    #scores is probably going to underestimate the significance.
    if background_correction:
        correction_dict = {}
        for i in set(metabolite_connections.values()):
            #if entire_network: add in a section to deal with the situation
            #where the entire network structure is considered but p-values are
            #only available for a limited subset.
            #
            #Basically, for each connectivity i we select i scores
            #number_of_randomizations times and build the null distribution
            #of the aggregate score.
            the_random_indices = randint.rvs(0, len(the_scores),
                                             size=(number_of_randomizations, i))
            random_score_distribution = array([sum(the_scores[x])
                                               for x in list(the_random_indices)]) / i**0.5
            correction_dict[i] = [mean(random_score_distribution),
                                  std(random_score_distribution)]
    for the_metabolite, the_score in metabolite_scores.iteritems():
        number_of_connections = metabolite_connections[the_metabolite]
        if number_of_connections > 0:
            #Correct based on background distribution
            if background_correction:
                #if the list of scores is only for significant perturbations
                #then the background correction shouldn't be applied because
                #the current sampling method only takes into account
                #the_scores, not the entire network.  It'd be more accurate to
                #assign unscored reactions a default score.
                the_score = ((the_score / number_of_connections**.5) -
                             correction_dict[number_of_connections][0]) / \
                            correction_dict[number_of_connections][1]
            else:
                the_score = the_score / number_of_connections**.5
            #Update the score
            metabolite_scores[the_metabolite] = the_score
    return_dictionary = {'scores': metabolite_scores,
                         'connections': metabolite_connections}
    if background_correction:
        return_dictionary['corrections'] = correction_dict
    return(return_dictionary)
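
#--- Illustrative usage sketch (not part of the original module) ---
#A minimal, hypothetical example of how identify_reporter_metabolites might be
#called.  The helper name and the reaction_id_to_p_value argument are
#assumptions for illustration only; reaction ids are mapped onto Reaction
#objects because the function expects cobra.Reaction instances as keys.
def _example_identify_reporter_metabolites(cobra_model, reaction_id_to_p_value):
    """Sketch: score reporter metabolites from {reaction_id: p_value} data."""
    #Convert reaction ids into the Reaction objects used as dictionary keys.
    reaction_scores_dict = dict((cobra_model.reactions.get_by_id(the_id), the_p)
                                for the_id, the_p in reaction_id_to_p_value.items())
    the_results = identify_reporter_metabolites(cobra_model,
                                                reaction_scores_dict,
                                                number_of_randomizations=1000,
                                                background_correction=True)
    #'scores' maps each Metabolite to its background-corrected aggregate
    #Z-score; return the metabolites ranked from most to least significant.
    return sorted(the_results['scores'].items(),
                  key=lambda the_pair: the_pair[1], reverse=True)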
def ppmap_identify_reporter_metabolites(keywords):
    """A function that receives a dict with all of the parameters for
    identify_reporter_metabolites.

    Serves to make it possible to call the reporter metabolites function from
    ppmap.  It will only be useful for parallel experiments, not for breaking
    up a single experiment.

    """
    #The job identifier is assumed to travel in the keyword dict under
    #'the_id'; pop it before dispatching the remaining keywords.
    the_id = keywords.pop('the_id', None)
    the_results = identify_reporter_metabolites(**keywords)
    return({'id': the_id,
            'results': the_results})
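
#A hypothetical sketch of the keyword dict consumed by
#ppmap_identify_reporter_metabolites: the identify_reporter_metabolites
#arguments plus a 'the_id' entry used to label the parallel job.  The names
#the_model and the_scores_dict are illustrative only.
#
#  the_keywords = {'the_id': 'experiment_1',
#                  'cobra_model': the_model,
#                  'reaction_scores_dict': the_scores_dict,
#                  'number_of_randomizations': 1000,
#                  'background_correction': True}
#  the_result = ppmap_identify_reporter_metabolites(the_keywords)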
if __name__ == '__main__':
    from cPickle import load
    from time import time
    solver = 'glpk'
    from cobra.test import salmonella_pickle, salmonella_reaction_p_values_pickle
    with open(salmonella_pickle) as in_file:
        cobra_model = load(in_file)
    with open(salmonella_reaction_p_values_pickle) as in_file:
        reaction_p = load(in_file)
    the_reactions = map(cobra_model.reactions.get_by_id, reaction_p.keys())
    the_scores = reaction_p.values()
    reaction_scores_dict = dict(zip(the_reactions, the_scores))
    tmp_reps = identify_reporter_metabolites(cobra_model, reaction_scores_dict,
                                             background_correction=True)
    print 'Need to add in validation for the test'