Package ete2 :: Package coretype :: Module arraytable
[hide private]
[frames] | [no frames]

Source Code for Module ete2.coretype.arraytable

  1  __VERSION__="ete2-2.0rev96"  
  2  # #START_LICENSE########################################################### 
  3  # 
  4  # Copyright (C) 2009 by Jaime Huerta Cepas. All rights reserved.   
  5  # email: jhcepas@gmail.com 
  6  # 
  7  # This file is part of the Environment for Tree Exploration program (ETE).  
  8  # http://ete.cgenomics.org 
  9  #   
 10  # ETE is free software: you can redistribute it and/or modify it 
 11  # under the terms of the GNU General Public License as published by 
 12  # the Free Software Foundation, either version 3 of the License, or 
 13  # (at your option) any later version. 
 14  #   
 15  # ETE is distributed in the hope that it will be useful, 
 16  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 17  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 18  # GNU General Public License for more details. 
 19  #   
 20  # You should have received a copy of the GNU General Public License 
 21  # along with ETE.  If not, see <http://www.gnu.org/licenses/>. 
 22  # 
 23  # #END_LICENSE############################################################# 
 24   
 25   
 26  import sys 
 27  import re 
 28  import math 
 29  from os import path 
 30   
 31  import numpy 
 32  from ete2.parser.text_arraytable import write_arraytable, read_arraytable 
 33   
 34  __all__ = ["ArrayTable"] 
 35   
class ArrayTable(object):
    """Named-row / named-column view over a matrix dataset.

    Intended for matrix datasets such as microarrays: a matrix can be
    loaded from a file and its row and column vectors accessed by name.

    Attributes, as initialised by ``__init__``:
      colNames  -- list of column names
      rowNames  -- list of row names
      colValues -- dict used to look a column vector up by column name
      rowValues -- dict used to look a row vector up by row name
      matrix    -- the matrix itself, None until something is loaded
                   (presumably a 2-D numpy array -- confirm in the parser)
      mtype     -- initialised to None
    """

    def __str__(self):
        """Return the string representation of the underlying matrix."""
        return str(self.matrix)

    def __init__(self, matrix_file=None, mtype="float"):
        """Create an empty table, optionally loading it from a file.

        matrix_file -- when not None, it is handed to read_arraytable()
                       together with ``mtype`` and this object.
        mtype       -- forwarded untouched to read_arraytable().
        """
        self.colNames = []
        self.rowNames = []
        self.colValues = {}
        self.rowValues = {}
        self.matrix = None
        self.mtype = None

        # If matrix file is supplied, the parser is given this instance
        # through its ``arraytable_object`` argument.
        if matrix_file is not None:
            read_arraytable(matrix_file,
                            mtype=mtype,
                            arraytable_object=self)

    def get_row_vector(self, rowname):
        """Return the vector stored for ``rowname``, or None if unknown."""
        return self.rowValues.get(rowname, None)

    def get_column_vector(self, colname):
        """Return the vector stored for ``colname``, or None if unknown."""
        return self.colValues.get(colname, None)

    def get_several_column_vectors(self, colnames):
        """Return a numpy array built from the vectors of several columns.

        Raises KeyError if any of the names is not a known column.
        """
        vectors = [self.colValues[cname] for cname in colnames]
        return numpy.array(vectors)

    def get_several_row_vectors(self, rownames):
        """Return a numpy array built from the vectors of several rows.

        Raises KeyError if any of the names is not a known row.
        """
        vectors = [self.rowValues[rname] for rname in rownames]
        return numpy.array(vectors)

    def remove_column(self, colname):
        """Remove the given column from the current dataset.

        A name that has no entry in ``colValues`` is silently ignored.
        """
        col_value = self.colValues.pop(colname, None)
        # Identity test on purpose: the stored vector may be a numpy
        # array, and ``array != None`` is an elementwise comparison in
        # current NumPy releases, which cannot be used as a condition.
        if col_value is not None:
            index = self.colNames.index(colname)
            # Positions of every column except the removed one.
            kept = [i for i in range(len(self.colNames)) if i != index]
            self.colNames.pop(index)
            newmatrix = self.matrix[:, kept]
            self._link_names2matrix(newmatrix)

    def merge_columns(self, groups, grouping_criterion):
        """Return a new ArrayTable in which columns are merged.

        'groups' argument must be a dictionary in which keys are the
        new column names, and each value is the list of current
        column names to be merged.

        'grouping_criterion' must be 'min', 'max' or 'mean', and
        defines how numeric values will be merged.

        Merged columns come first (in the iteration order of
        ``groups``), followed by the columns that were not merged, in
        their current order.

        Raises ValueError for an unsupported criterion, for an unknown
        column name, or when a column is listed more than once.

        Example:
           my_groups = {'NewColumn':['column5', 'column6']}
           new_Array = Array.merge_columns(my_groups, 'max')

        """
        if grouping_criterion == "max":
            grouping_f = get_max_vector
        elif grouping_criterion == "min":
            grouping_f = get_min_vector
        elif grouping_criterion == "mean":
            grouping_f = get_mean_vector
        else:
            raise ValueError(
                "grouping_criterion not supported. Use max|min|mean ")

        grouped_array = self.__class__()
        grouped_matrix = []
        colNames = []
        alltnames = set()
        for gname, tnames in groups.items():
            all_vectors = []
            for tn in tnames:
                if tn not in self.colValues:
                    raise ValueError(str(tn) + " column not found.")
                if tn in alltnames:
                    raise ValueError(
                        str(tn) + " duplicated column name for merging")
                alltnames.add(tn)
                vector = self.get_column_vector(tn).astype(float)
                all_vectors.append(vector)
            # One merged vector per group, combined by the criterion.
            grouped_matrix.append(grouping_f(all_vectors))
            # store group name
            colNames.append(gname)

        # Columns that belong to no group are carried over unchanged.
        for cname in self.colNames:
            if cname not in alltnames:
                grouped_matrix.append(self.get_column_vector(cname))
                colNames.append(cname)

        # Copy the row names (as transpose() does) so that the new
        # table does not share a mutable list with this one.
        grouped_array.rowNames = list(self.rowNames)
        grouped_array.colNames = colNames
        # grouped_matrix holds one vector per column; transpose it so
        # that the vectors become the columns of the new matrix.
        vmatrix = numpy.array(grouped_matrix).transpose()
        grouped_array._link_names2matrix(vmatrix)
        return grouped_array

    def transpose(self):
        """Return a new ArrayTable in which current matrix is transposed.

        Row names become column names and vice versa; both name lists
        are copied.
        """
        transposedA = self.__class__()
        transposedM = self.matrix.transpose()
        transposedA.colNames = list(self.rowNames)
        transposedA.rowNames = list(self.colNames)
        transposedA._link_names2matrix(transposedM)
        return transposedA

    # NOTE(review): the listing this code was taken from hides private
    # members (its line numbers jump from 161 to 183 at this point).
    # The ``_link_names2matrix(matrix)`` method called by
    # remove_column(), merge_columns() and transpose() is presumably
    # defined in that gap -- restore it from the original module.

    def write(self, fname, colnames=None):
        """Write the table to ``fname`` through write_arraytable().

        colnames -- forwarded to write_arraytable(); when omitted an
                    empty list is passed, as before.
        """
        # A fresh list per call avoids sharing one mutable default
        # object between calls.
        if colnames is None:
            colnames = []
        write_arraytable(self, fname, colnames=colnames)
186 187 188
def get_centroid_dist(vcenter, vlist, fdist):
    """Return twice the mean distance from the items of vlist to vcenter.

    vcenter -- reference point, passed as second argument to fdist
    vlist   -- sized collection of items
    fdist   -- callable ``fdist(item, vcenter)`` returning a number

    Raises ZeroDivisionError when vlist is empty.
    """
    total = sum((fdist(item, vcenter) for item in vlist), 0.0)
    mean_dist = total / len(vlist)
    return 2 * mean_dist
194
def get_average_centroid_linkage_dist(vcenter1, vlist1, vcenter2, vlist2,
                                      fdist):
    """Return the mean distance of each item to the *other* group's center.

    Every item of vlist1 is measured against vcenter2 and every item of
    vlist2 against vcenter1, using ``fdist(item, center)``; the summed
    distances are divided by the total number of items.

    Raises ZeroDivisionError when both lists are empty.
    """
    to_center2 = sum((fdist(item, vcenter2) for item in vlist1), 0.0)
    to_center1 = sum((fdist(item, vcenter1) for item in vlist2), 0.0)
    n_items = len(vlist1) + len(vlist2)
    return (to_center2 + to_center1) / n_items
202
def safe_mean(values):
    """Return ``(mean, std)`` of the finite items of values.

    Items for which numpy.isfinite() is false are discarded before
    numpy.mean() and numpy.std() are applied.
    """
    finite = [v for v in values if numpy.isfinite(v)]
    return numpy.mean(finite), numpy.std(finite)
210
def safe_mean_vector(vectors):
    """Return the mean and std profiles of vectors, skipping non-finite values.

    vectors -- non-empty sequence of equally long vectors

    Returns a ``(mean, std)`` pair.  With a single vector the result is
    that vector itself (returned as is, without filtering) and a numpy
    array of zeros.  Otherwise both items are lists holding, for each
    position, the numpy.mean() / numpy.std() of the finite values found
    at that position.
    """
    # if only one vector, avg = itself
    if len(vectors) == 1:
        return vectors[0], numpy.zeros(len(vectors[0]))

    # Takes the vector length from the first item
    length = len(vectors[0])

    # Local names deliberately differ from the module-level safe_mean().
    means = []
    stds = []
    # range() rather than xrange() so the code also runs on Python 3.
    for pos in range(length):
        finite = [v[pos] for v in vectors if numpy.isfinite(v[pos])]
        means.append(numpy.mean(finite))
        stds.append(numpy.std(finite))
    return means, stds
230
def get_mean_vector(vlist):
    """Return the mean along axis 0 of the array built from vlist."""
    return numpy.array(vlist).mean(axis=0)
234
def get_median_vector(vlist):
    """Return the median along axis 0 of the array built from vlist.

    The axis is given explicitly: without it numpy.median() reduces the
    flattened data to a single scalar, whereas the sibling
    get_mean_vector / get_max_vector / get_min_vector helpers all
    reduce along axis 0 and return a vector.
    """
    a = numpy.array(vlist)
    return numpy.median(a, 0)
238
def get_max_vector(vlist):
    """Return the maximum along axis 0 of the array built from vlist."""
    return numpy.array(vlist).max(axis=0)
242
def get_min_vector(vlist):
    """Return the minimum along axis 0 of the array built from vlist."""
    return numpy.array(vlist).min(axis=0)
246