1 """
2 The only thing that should be used externally from this module is
3 I{TEMPLATES}, a C{dict} of C{FastaTemplate} instances. A dictionary is
4 used so that templates can be selected dynamically at run-time.
5
6 Template types registered in I{TEMPLATES} are:
7
8 - B{I{default}} - plain old fasta line
9 - B{name} - everything after the ">"
10 - B{I{swissprot}} - fasta files from swissprot
11 - B{gi_num} - between first set of "|"s
12 - B{accession} - between 3rd and 4th "|"
13 - B{description} - everything after the 4th "|"
14 - B{I{pdb}} - the fasta file of the entire pdb
15 - B{idCode} - first four characters after ">"
16 - B{chainID} - any non-whitespace characters after first "_"
17 - B{type} - non-whitespace immediately following first ":"
18 - B{numRes} - non-whitespace characters immediately following "length:"
19 - B{description} - stripped characters after I{numRes}
20 - B{I{nr}} - the protein non-redundant database
21 - B{gi} - between first set of "|"s
22 - B{accession} - between 3rd and 4th "|"
23 - B{description} - stripped characters before brackets
24 - B{source} - stripped characters inside brackets
25 - B{I{nrblast}} - fasta file produced from blast output of the nr
26 - B{gi} - between first set of "|"s
27 - B{accession} - between 3rd and 4th "|"
28
29 @var TEMPLATES: a C{dict} holding instances of C{FastaTemplate}
30 used for parsing
31 """
32
33 import re
34
35
36
37
39 """
40 This class encapsulates template information for parsing fasta
41 files. Wraps a regular expression (I{regex}) used to parse the first
42 line of a fasta record and also a C{tuple} of C{str}ings (I{fields})
43 that name the information contained in the first line of the fasta
44 record.
45
46 @type regex: _sre.SRE_Pattern
47 @type fields: tuple
48 """
49
50
51
52
54 """
55 @param regex: the compiled C{_sre.SRE_Pattern} with which to
56 parse the file
57 @type regex: _sre.SRE_Pattern
58 @param fields: a C{tuple} of C{str}ings containing names of the
59 fields found by parsing the first line of the
60 fasta record
61 @type fields: tuple
62 """
63
64
65 self.regex = regex
66 self.fields = fields
67
68
69
70
def match(self, astring):
    """
    Match I{astring} against the template's compiled pattern.

    Delegates to the C{match} method of the I{regex} property, so the
    pattern is applied at the beginning of I{astring} rather than
    searched for anywhere inside it.

    @param astring: a string generally containing a line of the fasta
                    file being processed
    @type astring: str

    @return: the result of C{self.regex.match(astring)}; for a compiled
             C{re} pattern this is a match object, or C{None} when
             I{astring} does not match
    @rtype: _sre.SRE_Match
    """
    return self.regex.match(astring)
85
86
87
88
89
91 """
92 Sets the I{regex} property to I{rgx}, a C{_sre.SRE_Pattern}.
93
94 @param rgx: a compiled regular expression of the re module
95 @type rgx: _sre.SRE_Pattern
96 """
97 self._regex = rgx
98
99
100
101
103 """
104 Sets the I{fields} property to I{ary}, a C{tuple} of C{str}ings.
105
106 @param ary: a C{tuple} of C{str}ings naming the fields of the
107 type of fasta records
108 @type ary: tuple
109 """
110 self._fields = tuple(ary)
111
112
113
114
116 """
117 Returns the I{regex} property.
118
119 @return: the I{regex} property
120 @rtype: _sre.SRE_Pattern
121 """
122 return self._regex
123
124
125
126
128 """
129 Returns the I{fields} property.
130
131 @return: the I{fields} property
132 @rtype: tuple
133 """
134 return self._fields
135
136
137
138
140 """
141 Given the C{int} index I{n}, return the field at that index.
142
143 @return: the name of the field in the I{fields} property at
144 the index I{n}
145 @rtype: str
146 """
147 return self.fields[n]
148
149
150
151
152 regex = property(get_regex, set_regex)
153 fields = property(get_fields, set_fields)
154
155
156
157
158
class _t(object):
    """
    This C{class} is essentially a namespace to hold some values that
    will be used to provide templates for the I{TEMPLATES} C{dict}.

    Every template is stored as three class attributes: the compiled
    regular expression, the C{tuple} of field names (one name per
    capture group of that expression), and the C{FastaTemplate} built
    from the two.
    """
    # NOTE(review): the ``class`` and ``def`` header lines were missing
    # from the listing this block was recovered from. The class name is
    # taken from the ``_t.<name>`` lookups in TEMPLATES; ``__init__`` is
    # inferred from the docstring and error message below -- confirm
    # both (and the base class) against the original file.

    def __init__(self):
        """
        Will raise a C{RuntimeError} if called.

        @raise RuntimeError: raises a C{RuntimeError} under all
                             circumstances
        """
        # Call form instead of ``raise RuntimeError, "..."``: the comma
        # form is a SyntaxError on Python 3, the call form works on both.
        raise RuntimeError("This class can not be instantiated.")

    # default: everything after ">" (and any spaces that follow it).
    _default_regex = re.compile(r'^>\ *(.*)$')
    _default_fields = ("name",)
    _default_template = FastaTemplate(_default_regex, _default_fields)

    # swissprot: ">gi|<gi_num>|sp|<accession>|<description>".
    _swissprot_regex = re.compile(r'^>gi\|([^|]*)\|sp\|([^|]*)\|(.*)$')
    _swissprot_fields = ("gi_num", "accession", "description")
    _swissprot_template = FastaTemplate(_swissprot_regex, _swissprot_fields)

    # pdb: four-character id, "_", chain id, a "<label>:<type>" token,
    # "length:<numRes>", then the description.
    _pdb_regex = re.compile(r'^>(....)_(\S*)\s+[^:]*:(\S*)\s+length:(\S*)\s+(\S*.*)$')
    _pdb_fields = ("idCode", "chainID", "type", "numRes", "description")
    _pdb_template = FastaTemplate(_pdb_regex, _pdb_fields)

    # nr: ">gi|<gi>|<db>|<accession>|<description> [<source>]".
    _nr_regex = re.compile(r'^>gi\|([^|]*)\|[^|]*\|([^|]*)\|\s*([^\[]*)\s*\[([^\]]*)\]\s*$')
    _nr_fields = ("gi", "accession", "description", "source")
    _nr_template = FastaTemplate(_nr_regex, _nr_fields)

    # nrblast: same prefix as nr, but only gi and accession are captured.
    _nrblast_regex = re.compile(r'^>gi\|([^|]*)\|[^|]*\|([^|]*)\|.*$')
    _nrblast_fields = ("gi", "accession")
    _nrblast_template = FastaTemplate(_nrblast_regex, _nrblast_fields)
210
211
212
213
214
# Registry of the available templates, keyed by template name so that a
# template can be selected dynamically at run-time. The values are the
# template objects stored as attributes of the ``_t`` namespace.
TEMPLATES = {
    "default": _t._default_template,
    "swissprot": _t._swissprot_template,
    "pdb": _t._pdb_template,
    "nr": _t._nr_template,
    "nrblast": _t._nrblast_template,
}
222