Package ete2 :: Package coretype :: Module seqgroup
[hide private]
[frames] | no frames]

Source Code for Module ete2.coretype.seqgroup

  1  __VERSION__="ete2-2.0rev96"  
  2  # #START_LICENSE########################################################### 
  3  # 
  4  # Copyright (C) 2009 by Jaime Huerta Cepas. All rights reserved.   
  5  # email: jhcepas@gmail.com 
  6  # 
  7  # This file is part of the Environment for Tree Exploration program (ETE).  
  8  # http://ete.cgenomics.org 
  9  #   
 10  # ETE is free software: you can redistribute it and/or modify it 
 11  # under the terms of the GNU General Public License as published by 
 12  # the Free Software Foundation, either version 3 of the License, or 
 13  # (at your option) any later version. 
 14  #   
 15  # ETE is distributed in the hope that it will be useful, 
 16  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 17  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 18  # GNU General Public License for more details. 
 19  #   
 20  # You should have received a copy of the GNU General Public License 
 21  # along with ETE.  If not, see <http://www.gnu.org/licenses/>. 
 22  # 
 23  # #END_LICENSE############################################################# 
 24   
 25  """ 
 26  The 'seqgroup' module provides methods and classes to operate with 
 27  Multiple Sequence Files, including Multiple Sequence Alignments. 
 28   
 29  Currently, Fasta, Phylip sequential and Phylip interleaved formats are 
 30  supported. 
 31  """ 
 32   
 33  from ete2.parser.fasta import read_fasta, write_fasta 
 34  from ete2.parser.phylip import read_phylip, write_phylip 
 35   
 36  __all__ = ["SeqGroup"] 
 37   
class SeqGroup(object):
    """
    SeqGroup class can be used to store a set of sequences (aligned
    or not).

    CONSTRUCTOR ARGUMENTS:
    ======================

    * sequences: Path to the file containing the sequences or,
      alternatively, the text string containing the same information.

    * format (optional): the format in which sequences are encoded. Current
      supported formats are: "fasta", "phylip" (phylip sequential)
      and "iphylip" (phylip interleaved)

    RETURNS:
    ========
     A SeqGroup object to operate with sequences.

    EXAMPLES:
    =========
     msf = ">seq1\\nAAAAAAAAAAA\\n>seq2\\nTTTTTTTTTTTTT\\n"
     seqs = SeqGroup(msf, format="fasta")
     print(seqs.get_seq("seq1"))
    """

    def __init__(self, sequences=None, format="fasta"):
        """Create the group and, if "sequences" is given, load it.

        Raises ValueError when "format" (matched case-insensitively) is
        not one of the keys of self.parsers.
        """
        # format name -> [reader, writer, extra keyword arguments]
        self.parsers = {
            "fasta": [read_fasta, write_fasta, {}],
            "phylip": [read_phylip, write_phylip, {"interleaved": False}],
            "iphylip": [read_phylip, write_phylip, {"interleaved": True}],
        }

        # Entries are keyed by an internal id; names map to that id.
        self.id2name = {}
        self.name2id = {}
        self.id2comment = {}
        self.id2seq = {}

        if sequences is None:
            return

        format = format.lower()
        if format not in self.parsers:
            raise ValueError("Unsupported format: [%s]" % format)
        read, _, args = self.parsers[format]
        # The reader receives this object through "obj"; presumably it
        # fills the id/name tables in place -- confirm against the parser.
        read(sequences, obj=self, **args)

    def __len__(self):
        """Number of sequences stored."""
        return len(self.id2seq)

    def __contains__(self, item):
        """True if "item" is the name of a stored sequence."""
        return item in self.name2id

    def __str__(self):
        """Text representation as produced by write_fasta."""
        return write_fasta(self)

    def __iter__(self):
        """Iterate over (name, sequence, comments) tuples."""
        return self.iter_entries()

    def write(self, format="fasta", outfile=None):
        """ Returns the text representation of the sequences in the
        supplied given format (default=FASTA). If "outfile" argument is
        used, the result is written into the given path.

        Raises ValueError when "format" is not a supported format."""
        format = format.lower()
        if format not in self.parsers:
            raise ValueError("Unsupported format: [%s]" % format)
        _, write, args = self.parsers[format]
        return write(self, outfile, **args)

    def iter_entries(self):
        """ Returns an iterator over all sequences in the
        collection. Each item is a tuple with the sequence name,
        sequence, and sequence comments """
        # Iterating the keys stays lazy on both Python 2 and Python 3.
        for seqid in self.id2seq:
            yield (self.id2name[seqid],
                   self.id2seq[seqid],
                   self.id2comment.get(seqid, []))

    def get_seq(self, name):
        """ Returns the sequence associated to a given entry name.

        Raises KeyError if "name" is not stored."""
        return self.id2seq[self.name2id[name]]

    def get_entries(self):
        """ Returns the list of entries currently stored, as
        (name, sequence, comments) tuples."""
        # Built explicitly so the result is a real list on Python 3 too.
        return [(self.id2name[seqid], seq, self.id2comment.get(seqid, []))
                for seqid, seq in self.id2seq.items()]

    def set_seq(self, name, seq, comments=None):
        """Updates or creates the sequence of "name".

        "name" is stripped of surrounding whitespace; spaces, tabs and
        line breaks are removed from "seq". "comments" defaults to a new
        empty list for each call.
        """
        if comments is None:
            # A fresh list per entry: a shared default list would leak
            # comments between otherwise unrelated sequences.
            comments = []

        name = name.strip()
        for blank in (" ", "\t", "\n", "\r"):
            seq = seq.replace(blank, "")

        seqid = self.name2id.get(name)
        # Compare against None: 0 is a valid id and must not be mistaken
        # for "no id assigned yet".
        if seqid is None:
            # Smallest non-negative integer not in use (also correct when
            # the group is still empty).
            seqid = 0
            while seqid in self.id2seq:
                seqid += 1

        self.name2id[name] = seqid
        self.id2name[seqid] = name
        self.id2comment[seqid] = comments
        self.id2seq[seqid] = seq
149