Package tfasta :: Module _tfasta
[hide private]
[frames | no frames]

Source Code for Module tfasta._tfasta

  1  #! /usr/bin/env python 
  2   
  3  """ 
  4  A module for parsing and writing fasta files. 
  5   
  6  The most useful functions are: 
  7   
  8       - B{L{fasta_parser}}: returns an iterator for a fasta file 
  9       - B{L{make_fasta_from_dict}}: returns a string representation 
 10               of a fasta file given a C{dict} of sequences 
 11               keyed by record name 
 12       - B{L{make_fasta}}: returns a string representation of a 
 13               fasta record given a sequence (as a C{str}) and 
 14               a name (also as a C{str}) 
 15   
 16  See L{tfasta_templates} documentation for supported fasta file types. 
 17   
 18  @var FASTA_WIDTH: default width of fasta sequences 
 19  @type FASTA_WIDTH: int 
 20  """ 
 21   
 22  import re 
 23  import sys 
 24  import cStringIO 
 25  from tfasta_templates import TEMPLATES 
 26   
 27  T_DEF = TEMPLATES['default']  # template fasta_parser() falls back to when none is given
 28  T_SWISS = TEMPLATES['swissprot'] 
 29  T_PDB = TEMPLATES['pdb'] 
 30  T_NR = TEMPLATES['nr'] 
 31  T_NRBLAST = TEMPLATES['nrblast'] 
 32   
 33  FASTA_WIDTH = 60  # default line width used by make_fasta() and make_fasta_from_dict()
 34   
 35  CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz-"  # characters make_fasta() keeps; everything else is dropped
 36   
 37   
 38  ####################################################################### 
 39  # fasta_parser() 
 40  ####################################################################### 
def fasta_parser(filename, template=None, greedy=None, dogaps=False):
    """
    Given a I{filename}, returns an iterator that will iterate
    over the fasta file. It will C{yield} dictionaries keyed according
    to the C{fields} in C{template}. These dictionaries will all also
    include a sequence keyed by "sequence". Yielding dictionaries
    allows for flexibility in the types of fasta files parsed.

    Sequence characters are upper-cased and anything that is not a
    letter A-Z (or "-" when I{dogaps} is true) is dropped. Lines that
    appear before the first line matching I{template} are ignored.

    File format testing is not done, so make sure it is a fasta file.

    The file is closed when the iterator is exhausted, closed or
    garbage collected, even if iteration stops early or an error occurs.

    @param filename: name of the fasta file
    @type filename: str
    @param template: instance of C{FastaTemplate} class--choose from
        TEMPLATES or define your own. It must provide C{match(line)},
        C{fields} and C{get_field(index)}. Defaults to C{T_DEF}.
    @type template: FastaTemplate
    @param greedy: a C{bool} specifying whether to read the
        whole fasta file in at once. Set to C{True} for many smaller
        files or to C{False} for a few or one REALLY big ones.
        C{None} (the default) is treated as C{False}.
    @type greedy: bool
    @param dogaps: a C{bool} specifying whether to keep "-" in the
        sequence after parsing the file
            - if C{False}, then gaps are ignored
            - handy if processing an alignment
    @type dogaps: bool

    @return: iterator of C{dict}s, one per fasta record
    """
    # set the default template if necessary
    if template is None:
        template = T_DEF

    # characters (after upper-casing) that are allowed into a sequence
    alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    if dogaps:
        alphabet = alphabet + "-"

    fastafile = open(filename)
    try:
        if greedy:
            # slurp every line now so the file can be released immediately
            lines = fastafile.readlines()
            fastafile.close()
        else:
            # stream the file line by line
            lines = fastafile

        entry = None    # record being built; None until the first header
        pieces = []     # cleaned sequence lines of the current record
        for aline in lines:
            fasta_match = template.match(aline)
            if fasta_match:
                # a new header closes the previous record, if any
                if entry is not None:
                    yield _finish_record(entry, pieces)
                entry = {"sequence": ""}
                pieces = []
                # match group 0 is the entire match, so fields start at 1
                for index in range(len(template.fields)):
                    entry[template.get_field(index)] = \
                        fasta_match.group(index + 1)
            elif entry is not None:
                # inside a record, so this must be a sequence line
                pieces.append("".join(
                    [c.upper() for c in aline if c.upper() in alphabet]))
            # otherwise: text before the first record, keep looking

        # end of file: hand out the last record
        if entry is not None:
            yield _finish_record(entry, pieces)
    finally:
        # runs on exhaustion, on generator close and on errors alike;
        # closing an already closed file is harmless
        fastafile.close()


def _finish_record(entry, pieces):
    """
    Append the collected sequence chunks to C{entry["sequence"]} with a
    single join (instead of one string copy per line) and return I{entry}.
    """
    if pieces:
        entry["sequence"] = entry["sequence"] + "".join(pieces)
    return entry


###################################################################
# make_fasta_from_dict()
###################################################################
def make_fasta_from_dict(adict, width=FASTA_WIDTH):
    """
    Give it a C{dict} of sequences keyed by name of the sequence
    and it returns a fasta representation as a C{str}.

    Records are written in sorted order of their names, each one
    formatted by L{make_fasta}. Every record is preceded by a newline,
    so a non-empty result starts with an empty line; an empty C{dict}
    gives an empty string.

    @param adict: C{dict} of sequences keyed by name
    @type adict: dict
    @param width: number of sequence characters per line
    @type width: int

    @return: fasta representation of sequences as a C{str}
    @rtype: str
    """
    # sorted() works on any mapping, on Python 2 and 3 alike
    records = [make_fasta(aname, adict[aname], width)
               for aname in sorted(adict)]
    # one join over all records instead of rebuilding the string per record
    return "".join(["\n" + record for record in records])


###################################################################
# make_fasta()
###################################################################
def make_fasta(name, seq, width=FASTA_WIDTH):
    """
    Give it a sequence I{name} and a sequence (I{seq}) and it
    returns a fasta representation as a C{str}.

    Characters of I{seq} that are not in C{CHARS} are dropped and the
    rest is upper-cased. The result is the header line (">" followed by
    C{str(name)}) and then the sequence wrapped at I{width} characters
    per line, with no trailing newline. An empty sequence gives just
    the header line.

    @param name: name of sequence
    @type name: str
    @param seq: sequence as a C{str}
    @type seq: str
    @param width: number of sequence characters per line
    @type width: int

    @raise ValueError: if I{width} is smaller than 1

    @return: a string representation of a fasta record
    @rtype: str
    """
    if width < 1:
        raise ValueError("width must be a positive integer")
    residues = "".join([c for c in seq if c in CHARS]).upper()
    lines = [">" + str(name)]
    # slice out whole lines rather than appending one character at a time
    lines.extend([residues[start:start + width]
                  for start in range(0, len(residues), width)])
    return "\n".join(lines)



#######################################################################
# test_parser
#######################################################################
def test_parser(template, filename):
    """
    Tests for proper construction of a parser using I{template} or
    of the fasta file named I{filename}.

    Every line of the file that contains a ">" is expected to be
    matched by I{template}; the first one that is not aborts the test.

    @param template: C{FastaTemplate} describing fasta record
    @type template: FastaTemplate
    @param filename: name of fasta file
    @type filename: str

    @raise Exception: raises C{Exception} if fasta file is malformed
        or if the I{template} didn't work

    @return: C{True} if it works.
    @rtype: bool
    """
    afile = open(filename)
    try:
        for aline in afile:
            # a ">" anywhere in the line marks it as a record header
            if ">" in aline and not template.match(aline):
                raise Exception("%s\n\nmalformed fasta file: '%s'" %
                                (aline, filename))
    finally:
        # close the file on success, on failure and on unexpected errors
        afile.close()
    return True


# Despite its name this is a library helper, not a unit test: keep
# test runners such as nose and pytest from collecting it.
test_parser.__test__ = False
#######################################################################
#######################################################################
##
## main
##
#######################################################################
#######################################################################
if __name__ == "__main__":

    # both the fasta file name and the template type are required
    try:
        fastaname = sys.argv[1]
        fastype = sys.argv[2]
    except IndexError:
        # only a missing argument means "show usage"; anything else
        # (e.g. KeyboardInterrupt) is no longer swallowed
        print("usage: python _tfasta.py filename type")
        sys.exit(0)

    template = TEMPLATES[fastype]

    # make sure the template fits the file before parsing it
    test_parser(template, fastaname)

    # dump every parsed record
    for entry in fasta_parser(fastaname, template=template, greedy=True):
        print(entry)