Package csb :: Package bio :: Package io :: Module fasta
[frames] | no frames]

Source Code for Module csb.bio.io.fasta

  1  """ 
  2  FASTA format sequence I/O. 
  3   
  4  This module provides parsers and writers for sequences and alignments in 
  5  FASTA format. The most basic usage is: 
  6   
  7      >>> parser = SequenceParser() 
  8      >>> parser.parse_file('sequences.fa') 
  9      <SequenceCollection>   # collection of L{AbstractSequence}s 
 10   
 11  This will load all sequences in memory. If you are parsing a huge file, 
 12  then you could efficiently read the file sequence by sequence: 
 13   
 14      >>> for seq in parser.read('sequences.fa'): 
 15              ...            # seq is an L{AbstractSequence} 
 16               
 17  L{BaseSequenceParser} is the central class in this module, which defines a 
 18  common infrastructure for all sequence readers. L{SequenceParser} is a standard 
 19  implementation, and L{PDBSequenceParser} is specialized to read FASTA sequences 
 20  with PDB headers. 
 21   
 22  For parsing alignments, have a look at L{SequenceAlignmentReader} and 
 23  L{StructureAlignmentFactory}. 
 24   
 25  Finally, this module provides a number of L{OutputBuilder}s, which know how to 
 26  write L{AbstractSequence} and L{AbstractAlignment} objects to FASTA files: 
 27   
 28      >>> with open('file.fa', 'w') as out: 
 29              builder = OutputBuilder.create(AlignmentFormats.FASTA, out) 
 30              builder.add_alignment(alignment) 
 31              builder.add_sequence(sequence) 
 32              ... 
 33   
 34  or you could instantiate any of the L{OutputBuilder}s directly. 
 35  """ 
 36   
 37  import csb.io 
 38  import csb.core 
 39   
 40  from abc import ABCMeta, abstractmethod 
 41   
 42  from csb.bio.sequence import SequenceTypes, SequenceAlphabets, AlignmentFormats, SequenceError 
 43  from csb.bio.sequence import SequenceAlignment, StructureAlignment, A3MAlignment 
 44  from csb.bio.sequence import SequenceCollection, AbstractSequence, Sequence, RichSequence, ChainSequence 
45 46 47 -class BaseSequenceParser(object):
48 """ 49 FASTA parser template. Subclasses must implement the way FASTA strings are 50 handled by overriding C{BaseSequenceParser.read_sequence}. 51 52 @param product: sequence product factory (an L{AbstractSequence} subclass) 53 @type product: type 54 @param product_type: default L{SequenceTypes} member for the products 55 @type product_type: L{EnumItem} 56 """ 57 __metaclass__ = ABCMeta 58
59 - def __init__(self, product=Sequence, product_type=SequenceTypes.Protein):
60 61 self._product = None 62 63 if not issubclass(product, AbstractSequence): 64 raise TypeError(product) 65 66 if not product_type.enum is SequenceTypes: 67 raise TypeError(product_type) 68 69 self._product = product 70 self._type = product_type
71 72 @property
73 - def product_factory(self):
74 """ 75 Factory used to build sequence products 76 @rtype: class 77 """ 78 return self._product
79 80 @property
81 - def product_type(self):
82 """ 83 Default sequence type of the products - a member of L{SequenceTypes} 84 @rtype: enum item 85 """ 86 return self._type
87 88 @abstractmethod
89 - def read_sequence(self, string):
90 """ 91 Parse a single FASTA string 92 93 @return: a new sequence, created with L{BaseSequenceParser.product_factory} 94 @rtype: L{AbstractSequence} 95 """ 96 pass
97
98 - def parse_string(self, fasta_string):
99 """ 100 Read FASTA sequences from an (m)FASTA-formatted string 101 102 @param fasta_string: FASTA string to parse 103 @type fasta_string: str 104 105 @return: a list of L{Sequence}s 106 @rtype: L{SequenceCollection} 107 """ 108 109 stream = csb.io.MemoryStream() 110 stream.write(fasta_string) 111 112 return self.parse_file(stream)
113
114 - def parse_file(self, fasta_file):
115 """ 116 Read FASTA sequences from a (m)FASTA file 117 118 @param fasta_file: input FASTA file name or opened stream 119 @type fasta_file: str, file 120 121 @return: a list of L{Sequence}s 122 @rtype: L{SequenceCollection} 123 """ 124 if isinstance(fasta_file, csb.core.string): 125 stream = open(fasta_file) 126 else: 127 stream = fasta_file 128 129 seqs = [] 130 reader = csb.io.EntryReader(stream, AbstractSequence.DELIMITER, None) 131 132 for entry in reader.entries(): 133 seqs.append(self.read_sequence(entry)) 134 135 return SequenceCollection(seqs)
136
137 - def read(self, fasta_file):
138 """ 139 Read FASTA sequences from an (m)FASTA file. 140 141 @param fasta_file: input FASTA file name or opened stream 142 @type fasta_file: str, file 143 144 @return: efficient cursor over all L{Sequence}s (parse on demand) 145 @rtype: iterator 146 """ 147 if isinstance(fasta_file, csb.core.string): 148 stream = open(fasta_file) 149 else: 150 stream = fasta_file 151 152 reader = csb.io.EntryReader(stream, AbstractSequence.DELIMITER, None) 153 154 for entry in reader.entries(): 155 yield self.read_sequence(entry)
156
157 -class SequenceParser(BaseSequenceParser):
158 """ 159 Standard FASTA parser. See L{BaseSequenceParser} for details. 160 """ 161
162 - def read_sequence(self, string):
163 164 lines = string.strip().splitlines() 165 166 if not lines[0].startswith(AbstractSequence.DELIMITER): 167 lines = [''] + lines 168 if len(lines) < 2: 169 raise ValueError('Empty FASTA entry') 170 171 header = lines[0] 172 id = header[1:].split()[0] 173 sequence = ''.join(lines[1:]) 174 175 return self.product_factory(id, header, sequence, self.product_type)
176
177 -class PDBSequenceParser(SequenceParser):
178 """ 179 PDB FASTA parser. Reads the PDB ID and sequence type from the header. 180 See L{BaseSequenceParser} for more details. 181 """ 182
183 - def read_sequence(self, string):
184 185 seq = super(PDBSequenceParser, self).read_sequence(string) 186 187 if not (seq.header and seq.id) or not (len(seq.id) in(5, 6) and seq.header.find('mol:') != -1): 188 raise ValueError('Does not look like a PDB header: {0}'.format(seq.header)) 189 190 seq.id = seq.id.replace('_', '') 191 stype = seq.header.partition('mol:')[2].partition(' ')[0] 192 try: 193 seq.type = csb.core.Enum.parse(SequenceTypes, stype) 194 except csb.core.EnumValueError: 195 seq.type = SequenceTypes.Unknown 196 197 return seq
198
199 200 -class A3MSequenceIterator(object):
201
202 - def __init__(self, sequences, insertion):
203 204 self._temp = [] 205 self._insertion = insertion 206 207 for sequence in sequences: 208 209 row = list(sequence.sequence) 210 row.reverse() 211 self._temp.append(row)
212
213 - def next(self):
214 215 sequences = self._temp 216 217 column = [ ] 218 has_insertion = False 219 220 for sequence in sequences: 221 if len(sequence) > 0 and sequence[-1].islower(): 222 has_insertion = True 223 224 for sequence in sequences: 225 try: 226 if has_insertion and not sequence[-1].islower(): 227 column.append(self._insertion) 228 else: 229 column.append(sequence.pop()) 230 except IndexError: 231 column.append('') 232 233 if not any(column): 234 raise StopIteration() 235 236 return column
237
238 - def __iter__(self):
239 return self
240
241 - def __next__(self):
242 return self.next()
243
244 -class SequenceAlignmentReader(object):
245 """ 246 Sequence alignment parser. 247 248 @param product_type: default L{SequenceTypes} member for the sequence products 249 @type product_type: L{EnumItem} 250 @param strict: if True, raise exception on duplicate sequence identifiers. 251 See L{csb.bio.sequence.AbstractAlignment} for details 252 @type strict: bool 253 """ 254
255 - def __init__(self, product_type=SequenceTypes.Protein, strict=True):
256 257 if not product_type.enum is SequenceTypes: 258 raise TypeError(product_type) 259 260 self._type = product_type 261 self._strict = bool(strict)
262 263 @property
264 - def product_type(self):
265 """ 266 Default sequence type of the alignment entries - a member of L{SequenceTypes} 267 @rtype: enum item 268 """ 269 return self._type
270 271 @property
272 - def strict(self):
273 """ 274 True if strict mode is enabled 275 @rtype: bool 276 """ 277 return self._strict
278
279 - def read_fasta(self, string):
280 """ 281 Parse an alignment in multi-FASTA format. 282 283 @param string: alignment string 284 @type string: str 285 286 @rtype: L{SequenceAlignment} 287 """ 288 289 parser = SequenceParser(RichSequence, self.product_type) 290 sequences = parser.parse_string(string) 291 292 return SequenceAlignment(sequences, strict=self.strict)
293
294 - def read_a3m(self, string):
295 """ 296 Parse an alignment in A3M format. 297 298 @param string: alignment string 299 @type string: str 300 301 @rtype: L{A3MAlignment} 302 """ 303 alphabet = SequenceAlphabets.get(self.product_type) 304 305 # parse all "mis-aligned" sequences as case-sensitive strings 306 parser = SequenceParser(Sequence, self.product_type) 307 sequences = parser.parse_string(string) 308 309 # storage for expanded sequences 310 s = [] 311 312 for dummy in sequences: 313 s.append([]) 314 315 # expand all sequences with insertion characters and make them equal length 316 for column in A3MSequenceIterator(sequences, str(alphabet.INSERTION)): 317 for sn, char in enumerate(column): 318 s[sn].append(char) 319 320 # build normal sequence objects from the equalized sequence strings 321 aligned_seqs = [] 322 323 for sn, seq in enumerate(sequences): 324 325 sequence = RichSequence(seq.id, seq.header, s[sn], self.product_type) 326 aligned_seqs.append(sequence) 327 328 return A3MAlignment(aligned_seqs, strict=self.strict)
329
330 -class StructureAlignmentFactory(object):
331 """ 332 Protein structure alignment parser. 333 334 In order to construct the structural alignment, this factory needs a PDB 335 structure provider: an object, whose C{provider.get} method returns a 336 L{csb.bio.structute.Structure} for a given sequence identifier. Sequence 337 identifiers on the other hand need to be split into 'accession number' 338 and 'chain ID'. By default this is done using a standard PDB Entry ID 339 factory, but clients are free to provide custom factories. An C{id_factory} 340 must be a callable, which accepts a single string identifier and returns 341 an EntryID object. 342 343 @param provider: data source for all structures found in the alignment 344 @type provider: L{csb.bio.io.wwpdb.StructureProvider} 345 @param id_factory: callable factory, which transforms a sequence ID into 346 a L{csb.bio.io.wwpdb.EntryID} object. By default 347 this is L{csb.bio.io.wwpdb.EntryID.create}. 348 @type id_factory: callable 349 @param strict: if True, raise exception on duplicate sequence identifiers. 350 See L{csb.bio.sequence.AbstractAlignment} for details 351 @type strict: bool 352 """ 353
354 - def __init__(self, provider, id_factory=None, strict=True):
355 356 from csb.bio.io.wwpdb import EntryID 357 358 if id_factory is None: 359 id_factory = EntryID.create 360 if not hasattr(id_factory, '__call__'): 361 raise TypeError(id_factory) 362 363 if not hasattr(provider, 'get'): 364 raise TypeError(provider) 365 366 self._type = SequenceTypes.Protein 367 self._strict = bool(strict) 368 self._provider = provider 369 self._id_factory = id_factory
370 371 @property
372 - def product_type(self):
373 """ 374 Default sequence type of the alignment rows - a member of L{SequenceTypes} 375 @rtype: enum item 376 """ 377 return self._type
378 379 @property
380 - def provider(self):
381 """ 382 Current L{csb.bio.io.wwpdb.StructureProvider} instance in use 383 @rtype: L{StructureProvider} 384 """ 385 return self._provider
386 387 @property
388 - def id_factory(self):
389 """ 390 Current L{csb.bio.io.wwpdb.EntryID} factory instance in use 391 @rtype: L{EntryID} 392 """ 393 return self._id_factory
394 395 @property
396 - def strict(self):
397 """ 398 True if strict mode is enabled 399 @rtype: bool 400 """ 401 return self._strict
402
403 - def make_alignment(self, string):
404 """ 405 Build a protein structure alignment given a multi-FASTA string 406 and the current structure C{provider}. 407 408 @param string: alignment string 409 @type string: str 410 411 @rtype: L{SequenceAlignment} 412 413 @raise SequenceError: when an aligned sequence is not a proper 414 subsequence of its respective source PDB chain 415 @raise StructureNotFoundError: if C{provider} can't provide a structure 416 for a given sequence ID 417 @raise InvalidEntryIDError: if a given sequence ID cannot be parsed 418 """ 419 420 entries = [] 421 parser = SequenceParser(Sequence, self.product_type) 422 sequences = parser.parse_string(string) 423 424 for row in sequences: 425 id = self.id_factory(row.id) 426 chain = self.provider.get(id.accession).chains[id.chain] 427 428 entry = self.make_entry(row, chain) 429 entries.append(entry) 430 431 return StructureAlignment(entries, strict=self.strict)
432
433 - def make_entry(self, row, chain):
434 """ 435 Build a protein structure alignment entry, given a sequence alignment 436 entry and its corresponding source PDB chain. 437 438 @param row: sequence alignment entry (sequence with gaps) 439 @type row: L{AbstractSequence}, L{SequenceAdapter} 440 @param chain: source PDB chain 441 @type chain: L{csb.bio.structure.Chain} 442 443 @return: gapped chain sequence, containing cloned residues from the 444 source chain (except for the gaps) 445 @rtype: L{ChainSequence} 446 @raise SequenceError: when C{row} is not a proper subsequence of C{chain} 447 """ 448 offset = 1 449 residues = [] 450 sequence = row.strip().sequence.upper() 451 452 try: 453 start = chain.sequence.index(sequence) + 1 454 except ValueError: 455 raise SequenceError('{0}: not a subsequence of {1}'.format(row.id, chain.entry_id)) 456 457 for rinfo in row.residues: 458 459 if rinfo.type == row.alphabet.GAP: 460 residues.append(rinfo) 461 continue 462 else: 463 rank = start + offset - 1 464 assert chain.residues[rank].type == rinfo.type 465 residues.append(chain.residues[rank].clone()) 466 offset += 1 467 continue 468 469 return ChainSequence(row.id, row.header, residues, chain.type)
470
471 472 -class OutputBuilder(object):
473 """ 474 Base sequence/alignment string format builder. 475 476 @param output: destination stream, where the product is written. 477 @type output: file 478 @param headers: if False, omit headers 479 @type headers: bool 480 481 @note: File builders are not guaranteed to check the correctness of the 482 product. It is assumed that the client of the builder knows what 483 it is doing. 484 """ 485 __metaclass__ = ABCMeta 486 _registry = {} 487
488 - def __init__(self, output, headers=True):
489 490 if not hasattr(output, 'write'): 491 raise TypeError(output) 492 493 self._out = output 494 self._headers = bool(headers)
495 496 @staticmethod
497 - def create(format, *a, **k):
498 """ 499 Create an output builder, which knows how to handle the specified 500 sequence/alignment C{format}. Additional arguments are passed to the 501 builder's constructor. 502 503 @param format: L{AlignmentFormats} member 504 @type format: L{EnumItem} 505 @rtype: L{OutputBuilder} 506 """ 507 if format not in OutputBuilder._registry: 508 raise ValueError('Unhandled format: {0}'.format(format)) 509 510 klass = OutputBuilder._registry[format] 511 return klass(*a, **k)
512 513 @staticmethod
514 - def register(format, klass):
515 """ 516 Register a new output builder. 517 518 @param format: L{AlignmentFormats} member 519 @type format: L{EnumItem} 520 @param klass: builder class (L{OutputBuilder} sub-class) 521 @type klass: type 522 """ 523 assert format not in OutputBuilder._registry 524 assert issubclass(klass, OutputBuilder) 525 526 OutputBuilder._registry[format] = klass
527 528 @property
529 - def output(self):
530 """ 531 Destination stream 532 @rtype: stream 533 """ 534 return self._out
535 536 @property
537 - def headers(self):
538 """ 539 True if sequence headers will be written to the destination 540 @rtype: bool 541 """ 542 return self._headers
543
544 - def write(self, text):
545 """ 546 Write a chunk of C{text} to the output stream. 547 """ 548 self._out.write(text)
549
550 - def writeline(self, text):
551 """ 552 Write a chunk of C{text}, followed by a newline terminator. 553 """ 554 self._out.write(text) 555 self._out.write('\n')
556 557 @abstractmethod
558 - def add_sequence(self, sequence):
559 """ 560 Format and append a new sequence to the product. 561 @type sequence: L{AbstractSequence} 562 """ 563 pass
564
565 - def add_many(self, sequences):
566 """ 567 Format and append a collection of L{AbstractSequence}s to the product. 568 @type sequences: iterable of L{AbstractSequence}s 569 """ 570 for s in sequences: 571 self.add_sequence(s)
572 573 @abstractmethod
574 - def add_alignment(self, alignment):
575 """ 576 Format and append an alignment to the product. 577 @type alignment: L{AbstractAlignment} 578 """ 579 pass
580
581 - def add_separator(self, separator=''):
582 """ 583 Append a sequence separator to the product. 584 """ 585 self.writeline(separator)
586
587 - def add_comment(self, text, comment='#', length=120):
588 """ 589 Append a comment line to the product. 590 591 @param text: comment text 592 @type text: str 593 @param comment: comment prefix 594 @type comment: str 595 @param length: maximal comment length 596 @type length: int 597 """ 598 for i in range(0, len(text), length): 599 self.write(comment) 600 self.write(' ') 601 self.write(text[i : i + length]) 602 self.write('\n')
603
604 -class FASTAOutputBuilder(OutputBuilder):
605 """ 606 Formats sequences as standard FASTA strings. See L{OutputBuilder}. 607 """ 608 FORMAT = AlignmentFormats.FASTA 609
610 - def add_sequence(self, sequence):
611 612 if self.headers: 613 self.write(AbstractSequence.DELIMITER) 614 self.writeline(sequence.header) 615 616 insertion = str(sequence.alphabet.INSERTION) 617 gap = str(sequence.alphabet.GAP) 618 619 self.writeline(sequence.sequence.replace(insertion, gap))
620
621 - def add_alignment(self, alignment):
622 623 self.add_many(alignment.rows)
624
625 -class A3MOutputBuilder(OutputBuilder):
626 """ 627 Formats sequences as A3M strings. When appending an alignment, this builder 628 will write all insertion-containing columns in lower case. Also, gap symbols 629 are omitted if the respective columns contain insertions. 630 631 See L{OutputBuilder}. 632 """ 633 FORMAT = AlignmentFormats.A3M 634
635 - def add_sequence(self, sequence):
636 637 if self.headers: 638 self.write(AbstractSequence.DELIMITER) 639 self.writeline(sequence.header) 640 641 self.writeline(sequence.sequence)
642
643 - def add_alignment(self, alignment):
644 645 if isinstance(alignment, A3MAlignment): 646 self._add_a3m(alignment) 647 else: 648 self._add_proper(alignment)
649
650 - def _add_a3m(self, alignment):
651 652 for s in alignment.rows: 653 654 if self.headers: 655 self.write(AbstractSequence.DELIMITER) 656 self.writeline(s.header) 657 658 sequence = [] 659 660 for ci in s.columns: 661 662 if ci.residue.type != s.alphabet.INSERTION: 663 char = str(ci.residue.type) 664 665 if alignment.insertion_at(ci.column): 666 sequence.append(char.lower()) 667 else: 668 sequence.append(char) 669 else: 670 continue 671 672 self.writeline(''.join(sequence))
673
674 - def _add_proper(self, alignment):
675 676 for s in alignment.rows: 677 678 if self.headers: 679 self.write(AbstractSequence.DELIMITER) 680 self.writeline(s.header) 681 682 master = alignment.rows[1] 683 sequence = [] 684 685 for ci in s.columns: 686 char = str(ci.residue.type) 687 688 if master.columns[ci.column].residue.type == master.alphabet.GAP: 689 if ci.residue.type == s.alphabet.GAP: 690 continue 691 else: 692 sequence.append(char.lower()) 693 else: 694 sequence.append(char) 695 696 self.writeline(''.join(sequence))
697
698 -class PIROutputBuilder(OutputBuilder):
699 """ 700 Formats sequences as PIR FASTA strings, recognized by Modeller. 701 See L{OutputBuilder} for general alignment documentation. 702 """ 703 FORMAT = AlignmentFormats.PIR 704
705 - def add_sequence(self, sequence):
706 self._add(sequence, template=True)
707
708 - def add_alignment(self, alignment):
709 710 for n, sequence in enumerate(alignment.rows, start=1): 711 if n == 1: 712 self.add_target(sequence) 713 else: 714 self.add_template(sequence)
715 716
717 - def add_target(self, sequence):
718 self._add(sequence, template=False)
719
720 - def add_template(self, sequence):
721 self._add(sequence, template=True)
722
723 - def _add(self, sequence, template=True):
724 725 if self.headers: 726 727 if template: 728 type = 'structure' 729 else: 730 type = 'sequence' 731 732 id = sequence.id 733 start = end = '.' 734 735 if hasattr(sequence, 'start') and hasattr(sequence, 'end'): 736 start = sequence.start 737 end = sequence.end 738 739 header = 'P1;{0}\n{2}:{0}:{3}:{1}:{4}:{1}::::'.format(id[:-1], id[-1], type, start, end) 740 741 self.write(AbstractSequence.DELIMITER) 742 self.writeline(header) 743 744 insertion = str(sequence.alphabet.INSERTION) 745 gap = str(sequence.alphabet.GAP) 746 747 self.write(sequence.sequence.replace(insertion, gap)) 748 self.writeline('*')
749 750 751 # register builders 752 for klass in OutputBuilder.__subclasses__(): 753 OutputBuilder.register(klass.FORMAT, klass) 754