1 __VERSION__="ete2-2.0rev89"
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 import os
26 import string
27 from sys import stderr as STDERR
28
def read_fasta(source, obj=None):
    """Read a collection of sequences encoded in FASTA format.

    NOTE(review): the ``def`` line was missing from the listing this block
    was recovered from; the signature is reconstructed from the names the
    body uses (``source`` and ``obj``) -- confirm against the upstream file.

    Parameters:
        source: path of a FASTA file, or the FASTA text itself.  It is
            treated as a path when ``os.path.isfile(source)`` is true;
            otherwise it is split on newlines and parsed directly.
        obj: container to fill.  It must expose dict-like ``id2seq``,
            ``id2name``, ``name2id`` and ``id2comment`` attributes.  When
            None, a new ``ete2.coretype.seqgroup.SeqGroup`` is created.

    Returns:
        The filled container, or None when the last record has no
        sequence (a message is written to stderr in that case).

    Raises:
        Exception: if a record other than the last one has no sequence
            lines, or if sequence data appears before the first header.
    """
    if obj is None:
        # Imported only on this path: callers passing ``obj`` never trigger it.
        from ete2.coretype import seqgroup
        SC = seqgroup.SeqGroup()
    else:
        SC = obj

    if os.path.isfile(source):
        import sys
        # "U" turns on universal newlines on Python 2.  Python 3 text mode
        # already translates newlines and rejects "U" from 3.11 onwards.
        mode = "rU" if sys.version_info[0] < 3 else "r"
        # The context manager closes the handle even when parsing raises.
        with open(source, mode) as handle:
            return _parse_fasta_lines(handle, SC)
    return _parse_fasta_lines(source.split("\n"), SC)


def _parse_fasta_lines(lines, SC):
    """Fill *SC* from an iterable of FASTA lines; private helper of read_fasta.

    Returns *SC*, or None when the last record has no sequence.
    """
    names = set()
    seq_id = -1
    seq_name = None
    # Pieces of the current record's sequence; joined once per record so
    # long sequences are not re-copied on every input line.
    chunks = []

    for line in lines:
        line = line.strip()
        # Blank lines and '#' comment lines are ignored.
        if not line or line.startswith("#"):
            continue

        if line.startswith(">"):
            # Close the previous record before opening a new one.
            if seq_id > -1:
                if not chunks:
                    raise Exception("No sequence found for " + seq_name)
                SC.id2seq[seq_id] = "".join(chunks)

            seq_id += 1
            chunks = []

            # Header layout: name, then optional tab-separated comments.
            header_fields = [field.strip() for field in line[1:].split("\t")]
            seq_name = header_fields[0]

            if seq_name in names:
                # Prefix duplicates with the number of already stored names
                # that end with this name.
                tag = str(sum(1 for known in SC.name2id
                              if known.endswith(seq_name)))
                old_name = seq_name
                seq_name = tag + "_" + seq_name
                STDERR.write(
                    "Duplicated entry [%s] was renamed to [%s]\n"
                    % (old_name, seq_name))

            SC.id2seq[seq_id] = ""
            SC.id2name[seq_id] = seq_name
            SC.name2id[seq_name] = seq_id
            SC.id2comment[seq_id] = header_fields[1:]
            names.add(seq_name)
        else:
            if seq_name is None:
                raise Exception("Error readind sequences: Wrong format.")
            # The line is already stripped; only inner spaces remain.
            chunks.append(line.replace(" ", ""))

    # Store the sequence of the last record, if any record was opened.
    if seq_id > -1:
        SC.id2seq[seq_id] = "".join(chunks)

    if seq_name and SC.id2seq[seq_id] == "":
        STDERR.write("%s has no sequence\n" % seq_name)
        return None

    return SC
93
def write_fasta(sequences, outfile=None, seqwidth=80):
    """Write sequences using FASTA format.

    Parameters:
        sequences: iterable of ``(name, seq, comment)`` triples.  ``comment``
            is a sequence of strings appended to the header line, separated
            from the name and from each other by tabs.
        outfile: path of the file to write.  When None, nothing is written
            and the FASTA text is returned instead.
        seqwidth: maximum number of sequence characters per output line.

    Returns:
        The FASTA text when ``outfile`` is None, otherwise None.
    """
    # seqwidth is forwarded to the helper; previously it was accepted here
    # but never used, so every sequence was wrapped at the helper's default.
    records = [
        ">%s\n%s" % ("\t".join([name] + list(comment)),
                     _seq2str(seq, seqwidth))
        for name, seq, comment in sequences
    ]
    text = "\n".join(records)

    if outfile is None:
        return text

    # The context manager closes the file even if the write fails.
    with open(outfile, "w") as out:
        out.write(text)


def _seq2str(seq, seqwidth=80):
    """Return *seq* split into newline-terminated lines of up to *seqwidth* chars.

    NOTE(review): the ``def`` line was missing from the listing this block
    was recovered from; the signature is reconstructed from the one-argument
    call in write_fasta and the body's use of ``seqwidth`` -- confirm the
    default against the upstream file.
    """
    return "".join(
        seq[start:start + seqwidth] + "\n"
        for start in range(0, len(seq), seqwidth)
    )
112