Package ete2 :: Package parser :: Module newick
[hide private]
[frames] | no frames]

Source Code for Module ete2.parser.newick

  1  __VERSION__="ete2-2.0rev96"  
  2  # #START_LICENSE########################################################### 
  3  # 
  4  # Copyright (C) 2009 by Jaime Huerta Cepas. All rights reserved.   
  5  # email: jhcepas@gmail.com 
  6  # 
  7  # This file is part of the Environment for Tree Exploration program (ETE).  
  8  # http://ete.cgenomics.org 
  9  #   
 10  # ETE is free software: you can redistribute it and/or modify it 
 11  # under the terms of the GNU General Public License as published by 
 12  # the Free Software Foundation, either version 3 of the License, or 
 13  # (at your option) any later version. 
 14  #   
 15  # ETE is distributed in the hope that it will be useful, 
 16  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 17  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 18  # GNU General Public License for more details. 
 19  #   
 20  # You should have received a copy of the GNU General Public License 
 21  # along with ETE.  If not, see <http://www.gnu.org/licenses/>. 
 22  # 
 23  # #END_LICENSE############################################################# 
 24   
 25  import re 
 26  import os 
 27   
 28  __all__ = ["read_newick", "write_newick", "print_supported_formats"] 
 29   
 30  # Regular expressions used for reading newick format 
 31  _ILEGAL_NEWICK_CHARS = ":;(),\[\]\t\n\r=" 
 32  _NHX_RE = "\[&&NHX:[^\]]*\]" 
 33  _FLOAT_RE = "[+-]?\d+\.?\d*" 
 34  _NAME_RE = "[^():,;\[\]]+" 
 35   
 36  DEFAULT_DIST = 1.0 
 37  DEFAULT_NAME = '' 
 38  DEFAULT_SUPPORT = 1.0 
 39   
 40   
 41  # Allowed formats. This table is used to read and write newick using 
  42  # different conventions. You can also add your own formats in an easy way. 
 43  # 
 44  # 
  45  # FORMAT: [[LeafAttr1, LeafAttr1Type, Flexible?], [LeafAttr2, LeafAttr2Type, Flexible?],\ 
  46  #    [InternalAttr1, InternalAttr1Type, Flexible?], [InternalAttr2, InternalAttr2Type, Flexible?]] 
 47  # 
 48  # Attributes are placed in the newick as follows: 
 49  # 
 50  # .... ,LeafAttr1:LeafAttr2)InternalAttr1:InternalAttr2 ... 
 51  # 
 52  # 
 53  #           /-A 
 54  # -NoName--| 
 55  #          |          /-B 
 56  #           \C-------| 
 57  #                    |          /-D 
 58  #                     \E-------| 
 59  #                               \-G 
 60  # 
 61  # Format 0 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)1.000000:0.642905)1.000000:0.567737); 
 62  # Format 1 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)E:0.642905)C:0.567737); 
 63  # Format 2 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)1.000000:0.642905)1.000000:0.567737); 
 64  # Format 3 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)E:0.642905)C:0.567737); 
 65  # Format 4 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729))); 
 66  # Format 5 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729):0.642905):0.567737); 
  67  # Format 6 = (A,(B,(D,G):0.642905):0.567737); 
  68  # Format 7 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)E)C); 
  69  # Format 8 = (A,(B,(D,G)E)C); 
  70  # Format 9 = (A,(B,(D,G))); 
  71  # Format 100 = (,(,(,))); 
 72  NW_FORMAT = { 
 73    0: [['name', str, True],  ["dist", float, True],    ['support', float, True],   ["dist", float, True]], # Flexible with support 
 74    1: [['name', str, True],  ["dist", float, True],    ['name', str, True],      ["dist", float, True]], # Flexible with internal node names 
 75    2: [['name', str, False], ["dist", float, False],   ['support', float, False],  ["dist", float, False]],# Strict with support values 
 76    3: [['name', str, False], ["dist", float, False],   ['name', str, False],     ["dist", float, False]], # Strict with internal node names 
 77    4: [['name', str, False], ["dist", float, False],   [None, None, False],        [None, None, False]], 
 78    5: [['name', str, False], ["dist", float, False],   [None, None, False],        ["dist", float, False]], 
 79    6: [['name', str, False], [None, None, False],      [None, None, False],        ["dist", float, False]], 
 80    7: [['name', str, False], ["dist", float, False],   ["name", str, False],       [None, None, False]], 
 81    8: [['name', str, False], [None, None, False],      ["name", str, False],       [None, None, False]], 
 82    9: [['name', str, False], [None, None, False],      [None, None, False],        [None, None, False]], # Only topology with node names 
 83    100: [[None, None, False],  [None, None, False],      [None, None, False],        [None, None, False]] # Only Topology 
 84  } 
 85   
 86   
def format_node(node, node_type, format):
    """Return the newick text describing a single node.

    The two attributes to write, and the converters applied to them,
    are looked up in NW_FORMAT[format]: entries 0 and 1 when node_type
    is "leaf", entries 2 and 3 for any other node_type.

    The result is the first field followed by ":" and the second
    field.  A field whose converter is None is left out (together with
    its ":" separator).  A field whose attribute is missing or cannot
    be converted is written as "?".
    """
    if node_type == "leaf":
        first_spec = NW_FORMAT[format][0]
        second_spec = NW_FORMAT[format][1]
    else:
        first_spec = NW_FORMAT[format][2]
        second_spec = NW_FORMAT[format][3]

    # Each field is converted with its OWN converter (the first field
    # used to be converted with the second field's converter).
    first_part = _format_node_field(node, first_spec[0], first_spec[1])
    if second_spec[1] is None:
        second_part = ""
    else:
        second_part = ":" + _format_node_field(node, second_spec[0],
                                               second_spec[1])
    return "%s%s" % (first_part, second_part)


def _format_node_field(node, container, converter):
    """Return one newick field of `node`, or "?" when it is unavailable."""
    if converter is None:
        return ""
    try:
        value = getattr(node, container)
        if converter == str:
            # Text fields: replace characters with a meaning in newick.
            return re.sub("[" + _ILEGAL_NEWICK_CHARS + "]", "_", str(value))
        return "%0.6f" % converter(value)
    except (AttributeError, TypeError, ValueError):
        return "?"
130 131 # Used to write into specific formats
def node2leafformat(node, format):
    """Return the text of a leaf node for a hard-coded format number.

    The format numbers handled here are tested literally and are not
    looked up in NW_FORMAT.  Illegal newick characters in the node name
    are replaced by "_".  Returns None for any other format value.
    """
    safe_name = re.sub("[" + _ILEGAL_NEWICK_CHARS + "]", "_", str(node.name))

    if format in (0, 1, 2, 3):
        return "%s:%0.6f" % (safe_name, node.dist)
    if format in (4, 7):
        return ":%0.6f" % node.dist
    if format in (5, 6):
        return "%s" % safe_name
    return None
142
def node2internalformat(node, format):
    """Return the text of an internal node for a hard-coded format number.

    The format numbers handled here are tested literally and are not
    looked up in NW_FORMAT.  Illegal newick characters in the node name
    are replaced by "_".  Returns None for any other format value.
    """
    safe_name = re.sub("[" + _ILEGAL_NEWICK_CHARS + "]", "_", str(node.name))

    if format in (0, 1):
        return "%0.6f:%0.6f" % (node.support, node.dist)
    if format == 2:
        return "%s:%0.6f" % (safe_name, node.dist)
    if format in (3, 4):
        return ":%0.6f" % node.dist
    if format == 5:
        return "%s" % safe_name
    if format in (6, 7):
        return ""
    return None
156 164
class NewickError(Exception):
    """Error raised when newick input cannot be read."""
168
def read_newick(newick, root_node=None, format=0):
    """Read a newick tree from a string or a file and return its root.

    newick    -- a newick string, or the path of a file containing one.
                 If a file exists at that path its contents are read,
                 otherwise the argument itself is parsed.
    root_node -- node that becomes the root of the parsed tree.  When
                 None, a new ete2 TreeNode is created.  Children are
                 created through the root's own add_child(), so a
                 custom node object can be supplied here, and an
                 existing node can be used to concatenate tree
                 structures.
    format    -- key of NW_FORMAT selecting the newick flavour.

    Returns root_node.

    Raises NewickError if `newick` is not a string, or if the text does
    not start with "(" and end with ";".
    """
    if root_node is None:
        from ete2.coretype.tree import TreeNode
        root_node = TreeNode()

    # Accept unicode and str subclasses as well as plain str.
    try:
        string_types = basestring  # Python 2
    except NameError:
        string_types = str

    if not isinstance(newick, string_types):
        raise NewickError(
            "'newick' argument must be either a filename or a newick string.")

    if os.path.exists(newick):
        # Close the file deterministically instead of leaking the handle.
        with open(newick, 'rU') as handle:
            nw = handle.read()
    else:
        nw = newick

    nw = nw.strip()
    if not nw.startswith('(') or not nw.endswith(';'):
        raise NewickError(
            'Unexisting tree file or Malformed newick tree structure.')
    return _read_newick_from_string(nw, root_node, format)
200
def _read_newick_from_string(nw, root_node, format):
    """Build a tree under root_node from a newick string.

    The string is split on "(".  Every chunk after the first (which is
    always empty) opens one internal node: the first chunk uses
    root_node itself, later chunks add a child to the node currently
    open.  Inside a chunk, each comma-separated piece is a child of the
    open node, and every ")" in a piece closes one level: the text after
    it describes the node being closed, after which parsing moves up to
    its parent.

    Raises NewickError when the parentheses are unbalanced.
    Returns root_node.
    """
    if nw.count('(') != nw.count(')'):
        raise NewickError('Parentheses do not match. Broken tree structure')

    # Line breaks and tabs carry no information
    nw = re.sub("[\n\r\t]", "", nw)

    open_node = None
    for chunk in nw.split("(")[1:]:
        # The first chunk belongs to the root; the rest open a new child
        open_node = root_node if open_node is None else open_node.add_child()

        pieces = chunk.split(",")
        last_index = len(pieces) - 1
        for index, piece in enumerate(pieces):
            # An empty trailing piece means the next sibling is an
            # internal node, which arrives with the next "(" chunk.
            if index == last_index and piece.strip() == '':
                continue

            closings = piece.split(")")
            # Text before the first ")" is the leaf itself
            _read_node_data(closings[0], open_node, "leaf", format)
            # Each further part describes an internal node being closed
            for closing in closings[1:]:
                if closing.strip() == ";":
                    continue
                _read_node_data(closing, open_node, "internal", format)
                open_node = open_node.up
    return root_node
261
262 -def _parse_extra_features(node, NHX_string):
263 """ Reads node's extra data form its NHX string. NHX uses this 264 format: [&&NHX:prop1=value1:prop2=value2] """ 265 NHX_string = NHX_string.replace("[&&NHX:", "") 266 NHX_string = NHX_string.replace("]", "") 267 for field in NHX_string.split(":"): 268 try: 269 pname, pvalue = field.split("=") 270 except ValueError, e: 271 print NHX_string, field.split("=") 272 raise ValueError, e 273 node.add_feature(pname, pvalue)
274
def _read_node_data(subnw, current_node, node_type,format):
    """Parse the text of one node and store what it contains.

    For node_type "leaf" a new child is added to current_node and the
    leaf field specs (NW_FORMAT[format] entries 0 and 1) are used; for
    any other node_type current_node itself is filled in using the
    internal specs (entries 2 and 3).

    The text is matched from its start against
    <first field> <":" second field> <optional NHX block>; fields
    flagged as flexible are optional.  Matched values are converted and
    stored with node.add_feature().

    Raises NewickError when the text does not match.
    """
    if node_type == "leaf":
        node = current_node.add_child()
        offset = 0
    else:
        node = current_node
        offset = 2

    first_spec = NW_FORMAT[format][offset]
    second_spec = NW_FORMAT[format][offset + 1]
    container1, converterFn1, flexible1 = \
        first_spec[0], first_spec[1], first_spec[2]
    container2, converterFn2, flexible2 = \
        second_spec[0], second_spec[1], second_spec[2]

    # Pattern of the first field
    if converterFn1 == str:
        first_pattern = "(" + _NAME_RE + ")"
    elif converterFn1 == float:
        first_pattern = "(" + _FLOAT_RE + ")"
    elif converterFn1 is None:
        first_pattern = '()'

    # Pattern of the second field, which includes its ":" separator
    if converterFn2 == str:
        second_pattern = "(:" + _NAME_RE + ")"
    elif converterFn2 == float:
        second_pattern = "(:" + _FLOAT_RE + ")"
    elif converterFn2 is None:
        second_pattern = '()'

    if flexible1:
        first_pattern += "?"
    if flexible2:
        second_pattern += "?"

    node_pattern = r'%s\s*%s\s*(%s)?' % (first_pattern, second_pattern,
                                         _NHX_RE)
    found = re.match(node_pattern, subnw)
    if not found:
        raise NewickError("Unexpected leaf node format:\n\t" + subnw[0:50])

    groups = found.groups()
    first_text, second_text, nhx_text = groups[0], groups[1], groups[2]

    if first_text:
        node.add_feature(container1, converterFn1(first_text.strip()))

    if second_text:
        # Skip the leading ":" before converting
        node.add_feature(container2, converterFn2(second_text[1:].strip()))

    if nhx_text is not None and nhx_text.startswith("[&&NHX"):
        _parse_extra_features(node, nhx_text)
331
def write_newick(node, features=[], format=1, _is_root=True):
    """Recursively serialise the tree under `node` as a newick/NHX string.

    node     -- node whose subtree is written.
    features -- passed unchanged to _get_features_string() for every
                node: None writes no NHX block, an empty list writes
                all of a node's features, any other list writes the
                named ones.  The default list is never mutated here;
                it is kept as [] because None has a different meaning.
    format   -- key of NW_FORMAT selecting which fields are written.
    _is_root -- internal flag, False in recursive calls; the closing
                ";" is only appended when it is True and `node` has
                children.

    Returns the newick string.
    """
    if not node.children:
        # format_node() copes with missing attributes, so the name is
        # not read here (doing so raised AttributeError needlessly).
        return (format_node(node, "leaf", format)
                + _get_features_string(node, features))

    child_texts = [write_newick(child, features, format=format,
                                _is_root=False)
                   for child in node.children]
    pieces = ["(", ",".join(child_texts), ")"]
    # A node without a parent does not get name/support/dist fields.
    # NOTE(review): its NHX block is still written - confirm this is
    # the behaviour callers expect for the tree root.
    if node.up is not None:
        pieces.append(format_node(node, "internal", format))
    pieces.append(_get_features_string(node, features))
    if _is_root:
        pieces.append(";")
    return "".join(pieces)
360 361
def _get_features_string(self, features=[]):
    """Return the NHX block ("[&&NHX:k=v:k=v]") for a node.

    self     -- the node whose attributes are written.
    features -- names of the attributes to include.  None selects
                nothing, an empty list selects everything listed in
                self.features.  Names the node has no attribute for are
                skipped.

    Illegal newick characters in the values are replaced by "_".
    Returns an empty string when nothing is written.
    """
    if features is None:
        features = []
    elif features == []:
        features = self.features

    pairs = []
    for feature_name in features:
        if not hasattr(self, feature_name):
            continue
        raw_value = str(getattr(self, feature_name))
        clean_value = re.sub("[" + _ILEGAL_NEWICK_CHARS + "]", "_", raw_value)
        pairs.append("%s=%s" % (feature_name, clean_value))

    if not pairs:
        return ""
    return "[&&NHX:" + ":".join(pairs) + "]"
382