1
2
3 """
4 A module for parsing fasta files and for writing sequences in fasta format.
5
6 The most useful methods are:
7
8 - B{L{fasta_parser}}: returns an iterator for a fasta file
9 - B{L{make_fasta_from_dict}}: returns a string representation
10 of a fasta file given a C{dict} of sequences
11 keyed by record name
12 - B{L{make_fasta}}: returns a string representation of a
13 fasta record given a sequence (as a C{str}) and
14 a name (also as a C{str})
15
16 See L{tfasta_templates} documentation for supported fasta file types.
17
18 @var FASTA_WIDTH: default width of fasta sequences
19 @type FASTA_WIDTH: int
20 """
21
22 import re
23 import sys
24 import cStringIO
25 from tfasta_templates import TEMPLATES
26
# Default number of sequence characters per line of fasta output.
FASTA_WIDTH = 60

# Characters accepted when building a fasta record: upper- and lower-case
# ASCII letters plus the gap character "-".
CHARS = (
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "-"
)

# Short module-level aliases for entries of TEMPLATES (tfasta_templates).
T_DEF = TEMPLATES['default']
T_SWISS = TEMPLATES['swissprot']
T_PDB = TEMPLATES['pdb']
T_NR = TEMPLATES['nr']
T_NRBLAST = TEMPLATES['nrblast']
36
37
38
39
40
def fasta_parser(filename, template=None, greedy=None, dogaps=False):
    """
    Given a I{filename}, returns an iterator that will iterate
    over the fasta file. It will C{yield} dictionaries keyed according
    to the C{fields} in C{template}. These dictionaries will all also
    include a sequence keyed by "sequence". Yielding dictionaries
    allows for flexibility in the types of fasta files parsed.

    A record starts at every line that C{template.match} accepts. The
    lines that follow, up to the next accepted line, contribute their
    letters (upper-cased) to the record's sequence; every other
    character is dropped. Lines before the first record are ignored.

    File format testing is not done, so make sure it's a fasta file.

    The file is closed when the iterator is exhausted, when it is
    closed or garbage collected early, and when parsing raises.

    @param filename: name of the fasta file
    @type filename: str
    @param template: instance of C{FastaTemplate} class--choose from
                     TEMPLATES or define your own. Defaults to
                     C{T_DEF} when C{None}.
    @type template: FastaTemplate
    @param greedy: a C{bool} specifying whether to read the
                   whole fasta file in at once. Set to C{True} for many
                   smaller files or to C{False} for a few or one REALLY
                   big ones. C{None} means C{False}.
    @type greedy: bool
    @param dogaps: a C{bool} specifying whether to keep "-" in the
                   sequence after parsing the file
                     - if C{False}, then gaps are ignored
                     - handy if processing an alignment
    @type dogaps: bool

    @return: iterator over one C{dict} per fasta record
    @rtype: generator
    """
    if template is None:
        template = T_DEF
    if greedy is None:
        greedy = False

    # Characters kept from sequence lines (tested after upper-casing).
    alphabet = set("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
    if dogaps:
        alphabet.add("-")

    fastafile = open(filename)
    # try/finally (rather than a trailing close) so the handle is released
    # even if the caller stops iterating early or the template raises.
    try:
        if greedy:
            # Pull the whole file into memory and parse from the buffer.
            contents = fastafile.read()
            fastafile.close()
            fastafile = cStringIO.StringIO(contents)

        entry = None   # record being built; None until the first header
        chunks = []    # cleaned sequence pieces, joined once per record

        for aline in fastafile:
            fasta_match = template.match(aline)
            if fasta_match:
                # A new header terminates the record in progress.
                if entry is not None:
                    entry["sequence"] = "".join(chunks)
                    yield entry
                entry = {"sequence": ""}
                chunks = []
                # Field i of the template is filled from regex group i + 1.
                for index in range(len(template.fields)):
                    field = template.get_field(index)
                    entry[field] = fasta_match.group(index + 1)
            elif entry is not None:
                chunks.append(
                    "".join([c.upper() for c in aline
                             if c.upper() in alphabet]))

        # End of file: flush the last record, if there was one.
        if entry is not None:
            entry["sequence"] = "".join(chunks)
            yield entry
    finally:
        fastafile.close()
124
125
126
127
128
# NOTE(review): the def line was missing from the reviewed listing; the
# signature below is reconstructed from the names the body uses and from the
# module docstring -- confirm against the original file.
def make_fasta_from_dict(adict, width=FASTA_WIDTH):
    """
    Give it a C{dict} of sequences keyed by name of the sequence
    and it returns a fasta representation as a C{str}.

    Records are written in sorted order of their names. Every record,
    including the first, is preceded by a newline, so a non-empty result
    begins with a newline character. An empty C{dict} gives an empty
    string.

    @param adict: C{dict} of sequences keyed by name
    @type adict: dict
    @param width: number of sequence characters per line, passed on
                  to L{make_fasta}
    @type width: int

    @return: fasta representation of sequences as a C{str}
    @rtype: str
    """
    # One join instead of repeated concatenation; sorted() also works where
    # keys() does not return a sortable list.
    return "".join(["\n" + make_fasta(aname, adict[aname], width)
                    for aname in sorted(adict.keys())])
147
148
149
150
151
# NOTE(review): the def line was missing from the reviewed listing; the
# signature below is reconstructed from the call
# make_fasta(aname, aseq, width) and the docstring -- confirm the default.
def make_fasta(name, seq, width=FASTA_WIDTH):
    """
    Give it a sequence I{name} and a sequence (I{seq}) and it
    returns a fasta representation as a C{str}.

    Characters of I{seq} that are not in C{CHARS} are dropped and the
    rest is upper-cased. The result is the header line C{">" + name}
    followed by the sequence in lines of at most I{width} characters.
    There is no trailing newline, and an empty sequence gives just
    the header.

    @param name: name of sequence
    @type name: str
    @param seq: sequence as a C{str}
    @type seq: str
    @param width: number of sequence characters per line
    @type width: int

    @raise ValueError: if I{width} is less than 1

    @return: a string representation of a fasta record
    @rtype: str
    """
    if width < 1:
        raise ValueError("width must be a positive integer, got %r" % (width,))

    seq = "".join([c for c in seq if c in CHARS]).upper()

    # Header first, then the sequence cut into width-sized slices.
    lines = [">" + str(name)]
    lines.extend([seq[start:start + width]
                  for start in range(0, len(seq), width)])
    return "\n".join(lines)
177
178
179
180
181
182
# NOTE(review): the def line was missing from the reviewed listing; the
# signature below is reconstructed from the call
# test_parser(template, fastaname) and the docstring.
def test_parser(template, filename):
    """
    Tests for proper construction of a parser using I{template} or
    of the fasta file named I{filename}.

    Every line of the file that contains a C{">"} must be accepted by
    C{template.match}; the first line that is not raises.

    @param template: C{FastaTemplate} describing fasta record
    @type template: FastaTemplate
    @param filename: name of fasta file
    @type filename: str

    @raise Exception: raises C{Exception} if fasta file is malformed
                      or if the I{template} didn't work

    @return: C{True} if it works.
    @rtype: bool
    """
    afile = open(filename)
    # finally guarantees the close on every exit path, including an
    # exception raised by the template itself.
    try:
        for aline in afile:
            if ">" in aline and not template.match(aline):
                raise Exception("%s\n\nmalformed fasta file: '%s'" %
                                (aline, filename))
    finally:
        afile.close()
    return True


# This is a library function, not a unit test; the flag keeps test
# collectors that match on the "test_" prefix from trying to run it.
test_parser.__test__ = False
210
211
212
213
214
215
216
217
218
219
220
221
if __name__ == "__main__":
    # Command-line check: validate the file against the named template,
    # then print every parsed record.
    try:
        fastaname = sys.argv[1]
        fastype = sys.argv[2]
    except IndexError:
        # Only a missing argument should trigger the usage message; a bare
        # except would also swallow KeyboardInterrupt and SystemExit.
        print("usage: python _tfasta.py filename type")
        sys.exit(0)

    template = TEMPLATES[fastype]

    test_parser(template, fastaname)

    for entry in fasta_parser(fastaname, template=template, greedy=True):
        print(entry)
237