1
2
"""
tfasta: reading and writing of fasta files

This is automatically generated documentation and should
not be relied on for the API. Please
see the official documentation at http://pythonhosted.org/tfasta/.

The most useful functions are:

  - B{L{fasta_parser}}: returns an iterator for a fasta file
  - B{L{string_fasta_parser}}: returns an iterator for fasta text
  - B{L{io_fasta_parser}}: returns an iterator for a C{file}-like
        object containing fasta text
  - B{L{make_fasta_from_dict}}: returns a string representation
        of a fasta file given a C{dict} of sequences
        keyed by record name
  - B{L{make_fasta}}: returns a string representation of a
        fasta record given a sequence (as a C{str}) and
        a name (also as a C{str})

See L{tfasta_templates} documentation for supported fasta file types.

@var FASTA_WIDTH: default width of fasta sequences
@type FASTA_WIDTH: int
"""
27
28 import re
29 import sys
30 import cStringIO
31 from tfasta_templates import FastaTemplate, TEMPLATES
32
# Shorthand names for the header templates registered in TEMPLATES
# (imported from tfasta_templates).
T_DEF = TEMPLATES['default']
T_SWISS = TEMPLATES['swissprot']
T_PDB = TEMPLATES['pdb']
T_NR = TEMPLATES['nr']
T_NRBLAST = TEMPLATES['nrblast']

# Default width of fasta sequences (number of characters per sequence line).
FASTA_WIDTH = 60

# Characters make_fasta keeps from a sequence: ASCII letters and the gap
# character "-". Anything else is dropped before the record is written.
CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz-"
42
43
44
45
# NOTE(review): the ``def`` line is absent from this listing. The name and
# positional order come from the calls ``io_fasta_parser(fastafile, template,
# dogaps)`` elsewhere in this module; the defaults are assumed -- confirm
# against the published API.
def io_fasta_parser(fastafile, template=None, dogaps=False):
    """
    Helper generator function for L{fasta_parser} and
    L{string_fasta_parser}.

    Given I{fastafile} (C{file}-like object, open for reading),
    returns an iterator that iterates over the fasta file.
    It will C{yield} dictionaries keyed according
    to the C{fields} in C{template}. These dictionaries will all also
    include a sequence keyed by "sequence".

    Lines that appear before the first header line are ignored.
    I{fastafile} is closed when iteration finishes, and also when the
    iterator is closed early or an error interrupts parsing.

    @param fastafile: C{file}-like object containing fasta text,
        opened for reading
    @param template: instance of C{FastaTemplate} class--choose from
        TEMPLATES or define your own. C{None} selects C{T_DEF}.
    @type template: FastaTemplate
    @param dogaps: a C{bool} specifying whether to keep "-" in the
        sequence after parsing the file
          - if C{False}, then gaps are ignored
          - handy if processing an alignment
    @type dogaps: bool

    @return: iterator over one C{dict} per fasta record
    """
    if template is None:
        template = T_DEF

    # Sequence characters are upper-cased and then filtered against this set.
    alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    if dogaps:
        alphabet = alphabet + "-"

    entry = None   # record being assembled; None until the first header
    pieces = []    # filtered sequence lines belonging to ``entry``

    try:
        while True:
            aline = fastafile.readline()
            if not aline:
                break

            header = template.match(aline)
            if header:
                # A new header ends the previous record, if there was one.
                if entry is not None:
                    entry["sequence"] = "".join(pieces)
                    yield entry
                entry = {"sequence": ""}
                pieces = []
                # Regex group N+1 holds the value for template field N.
                for index in range(len(template.fields)):
                    entry[template.get_field(index)] = header.group(index + 1)
            elif entry is not None:
                # Collect per line and join once per record instead of
                # growing the sequence string line by line.
                pieces.append(
                    "".join([c.upper() for c in aline if c.upper() in alphabet])
                )

        # End of input: emit the record still being assembled.
        if entry is not None:
            entry["sequence"] = "".join(pieces)
            yield entry
    finally:
        fastafile.close()
115
116
117
118
119
def fasta_parser(filename, template=None, greedy=None, dogaps=False):
    """
    Given a I{filename}, returns an iterator that iterates
    over the fasta file. It will C{yield} dictionaries keyed according
    to the C{fields} in C{template}. These dictionaries will all also
    include a sequence keyed by "sequence". Yielding dictionaries
    allows for flexibility in the types of fasta files parsed.

    File format testing is not done, so make sure it's a fasta file.

    @param filename: name of the fasta file
    @type filename: str
    @param template: instance of C{FastaTemplate} class--choose from
        TEMPLATES or define your own.
    @type template: FastaTemplate
    @param greedy: a C{bool} specifying whether to read the
        whole fasta file in at once. Set to C{True} for many smaller
        files or to C{False} for a few or one REALLY big ones.
        C{None} (the default) is treated as C{False}.
    @type greedy: bool
    @param dogaps: a C{bool} specifying whether to keep "-" in the
        sequence after parsing the file
          - if C{False}, then gaps are ignored
          - handy if processing an alignment
    @type dogaps: bool

    @return: the iterator produced by L{io_fasta_parser}
    """
    fastafile = open(filename)

    if greedy:
        # Read everything into memory and parse from the in-memory copy.
        # The real file is released immediately, even if the read fails.
        try:
            contents = fastafile.read()
        finally:
            fastafile.close()
        fastafile = cStringIO.StringIO(contents)

    # When not greedy, the open file is handed over to io_fasta_parser.
    return io_fasta_parser(fastafile, template, dogaps)
156
157
158
159
# NOTE(review): the ``def`` line is absent from this listing. The name comes
# from the module docstring and the parameter order from the @param order
# below; the defaults are assumed -- confirm against the published API.
def string_fasta_parser(astr, template=None, dogaps=False):
    """
    Given I{astr} (string of fasta), returns an iterator that iterates
    over the fasta file. It will C{yield} dictionaries keyed according
    to the C{fields} in C{template}. These dictionaries will all also
    include a sequence keyed by "sequence". Yielding dictionaries
    allows for flexibility in the types of fasta files parsed.

    This function will do its best to remove unneeded whitespace,
    including line breaks: every line is stripped of leading and
    trailing whitespace and empty lines are dropped before parsing.

    Beyond simple extra whitespace, the I{astr} should be properly
    formatted fasta text.

    @param astr: fasta text
    @type astr: str
    @param template: instance of C{FastaTemplate} class--choose from
        TEMPLATES or define your own.
    @type template: FastaTemplate
    @param dogaps: a C{bool} specifying whether to keep "-" in the
        sequence after parsing the file
          - if C{False}, then gaps are ignored
          - handy if processing an alignment
    @type dogaps: bool

    @return: the iterator produced by L{io_fasta_parser}
    """
    stripped = [line.strip() for line in astr.splitlines()]
    cleaned = "\n".join([line for line in stripped if line])
    return io_fasta_parser(cStringIO.StringIO(cleaned), template, dogaps)
187
188
189
190
# NOTE(review): the ``def`` line is absent from this listing. The name comes
# from the module docstring and the parameters from the names used in the
# body; the ``width`` default is assumed from the module's description of
# FASTA_WIDTH -- confirm against the published API.
def make_fasta_from_dict(adict, width=FASTA_WIDTH):
    """
    Give it a C{dict} of sequences keyed by name of the sequence
    and it returns a fasta representation as a C{str}.

    Records are written in sorted order of their names. Each record is
    preceded by a newline, so a non-empty result starts with a newline;
    an empty C{dict} gives an empty string.

    @param adict: C{dict} of sequences keyed by name
    @type adict: dict
    @param width: width passed on to L{make_fasta} for every record
    @type width: int

    @return: fasta representation of sequences as a C{str}
    @rtype: str
    """
    records = [make_fasta(aname, adict[aname], width) for aname in sorted(adict)]
    # Join once instead of re-joining the growing text for every record.
    return "".join(["\n" + record for record in records])
209
210
211
212
213
# NOTE(review): the ``def`` line is absent from this listing. The name and
# argument order come from the call ``make_fasta(aname, aseq, width)`` in
# make_fasta_from_dict; the ``width`` default is assumed from the module's
# description of FASTA_WIDTH -- confirm against the published API.
def make_fasta(name, seq, width=FASTA_WIDTH):
    """
    Give it a sequence I{name} and a sequence (I{seq}) and it
    returns a fasta representation as a C{str}.

    Characters of I{seq} that are not in C{CHARS} are dropped and the
    rest is upper-cased. The result is a C{">name"} header followed by
    the sequence, with a line break before every I{width}-th character.
    There is no trailing newline.

    @param name: name of sequence
    @type name: str
    @param seq: sequence as a C{str}
    @type seq: str
    @param width: number of sequence characters per line
    @type width: int

    @return: a string representation of a fasta record
    @rtype: str
    """
    residues = "".join([c for c in seq if c in CHARS]).upper()

    # Collect pieces and join once rather than growing a string one
    # character at a time.
    pieces = [">" + str(name)]
    for place, char in enumerate(residues):
        if (place % width) == 0:
            pieces.append("\n")
        pieces.append(char)
    return "".join(pieces)
239
240
241
242
243
244
# NOTE(review): the ``def`` line is absent from this listing and the function
# name does not appear anywhere else in it. ``test_parser(template,
# filename)`` is presumed from the docstring's parameter order -- confirm the
# name and signature against the published API before merging.
def test_parser(template, filename):
    """
    Tests for proper construction of a parser using I{template} or
    of the fasta file named I{filename}.

    Every line of the file that contains ">" must be matched by
    I{template}; other lines are not examined.

    @param template: C{FastaTemplate} describing fasta record
    @type template: FastaTemplate
    @param filename: name of fasta file
    @type filename: str

    @raise Exception: raises C{Exception} if fasta file is malformed
        or if the I{template} didn't work

    @return: C{True} if it works.
    @rtype: bool
    """
    afile = open(filename)
    try:
        for aline in afile:
            if ">" in aline and not template.match(aline):
                raise Exception(
                    "%s\n\nmalformed fasta file: '%s'" % (aline, filename)
                )
    finally:
        # Release the file on every path, including errors raised by the
        # template itself.
        afile.close()
    return True
272