1 """
2 PDB structure parsers, format builders and database providers.
3
4 The most basic usage is:
5
6 >>> parser = StructureParser('structure.pdb')
7 >>> parser.parse_structure()
8 <Structure> # a Structure object (model)
9
10 or if this is an NMR ensemble:
11
12 >>> parser.parse_models()
13 <Ensemble> # an Ensemble object (collection of alternative Structure-s)
14
15 This module introduces a family of PDB file parsers. The common interface of all
16 parsers is defined in L{AbstractStructureParser}. This class has several
17 implementations:
18
19 - L{RegularStructureParser} - handles normal PDB files with SEQRES fields
20 - L{LegacyStructureParser} - reads structures from legacy or malformed PDB files,
21 which are lacking SEQRES records (initializes all residues from the ATOMs instead)
22 - L{PDBHeaderParser} - reads only the headers of the PDB files and produces structures
23 without coordinates. Useful for reading metadata (e.g. accession numbers or just
24 plain SEQRES sequences) with minimum overhead
25
26 Unless you have a special reason, you should use the L{StructureParser} factory,
27 which returns a proper L{AbstractStructureParser} implementation, depending on the
28 input PDB file. If the input file looks like a regular PDB file, the factory
29 returns a L{RegularStructureParser}, otherwise it instantiates L{RegularStructureParser}.
30 L{StructureParser} is in fact an alias for L{AbstractStructureParser.create_parser}.
31
32 Another important abstraction in this module is L{StructureProvider}. It has several
33 implementations which can be used to retrieve PDB L{Structure}s from various sources:
34 file system directories, remote URLs, etc. You can easily create your own provider as
35 well. See L{StructureProvider} for details.
36
37 Finally, this module gives you some L{FileBuilder}s, used for text serialization
38 of L{Structure}s and L{Ensemble}s:
39
40 >>> builder = PDBFileBuilder(stream)
41 >>> builder.add_header(structure)
42 >>> builder.add_structure(structure)
43
44 where stream is any Python stream, e.g. an open file or sys.stdout.
45
46 See L{Ensemble} and L{Structure} from L{csb.bio.structure} for details on these
47 objects.
48 """
49
50 import re
51 import os
52 import numpy
53 import datetime
54 import multiprocessing
55
56 import csb.bio.structure
57 import csb.core
58 import csb.io
59
60 from abc import ABCMeta, abstractmethod
61 from csb.bio.sequence import SequenceTypes, SequenceAlphabets
62 from csb.bio.structure import ChemElements, SecStructures
63
64
65 PDB_AMINOACIDS = {
66 'PAQ': 'TYR', 'AGM': 'ARG', 'ILE': 'ILE', 'PR3': 'CYS', 'GLN': 'GLN',
67 'DVA': 'VAL', 'CCS': 'CYS', 'ACL': 'ARG', 'GLX': 'GLX', 'GLY': 'GLY',
68 'GLZ': 'GLY', 'DTH': 'THR', 'OAS': 'SER', 'C6C': 'CYS', 'NEM': 'HIS',
69 'DLY': 'LYS', 'MIS': 'SER', 'SMC': 'CYS', 'GLU': 'GLU', 'NEP': 'HIS',
70 'BCS': 'CYS', 'ASQ': 'ASP', 'ASP': 'ASP', 'SCY': 'CYS', 'SER': 'SER',
71 'LYS': 'LYS', 'SAC': 'SER', 'PRO': 'PRO', 'ASX': 'ASX', 'DGN': 'GLN',
72 'DGL': 'GLU', 'MHS': 'HIS', 'ASB': 'ASP', 'ASA': 'ASP', 'NLE': 'LEU',
73 'DCY': 'CYS', 'ASK': 'ASP', 'GGL': 'GLU', 'STY': 'TYR', 'SEL': 'SER',
74 'CGU': 'GLU', 'ASN': 'ASN', 'ASL': 'ASP', 'LTR': 'TRP', 'DAR': 'ARG',
75 'VAL': 'VAL', 'CHG': 'ALA', 'TPO': 'THR', 'CLE': 'LEU', 'GMA': 'GLU',
76 'HAC': 'ALA', 'AYA': 'ALA', 'THR': 'THR', 'TIH': 'ALA', 'SVA': 'SER',
77 'MVA': 'VAL', 'SAR': 'GLY', 'LYZ': 'LYS', 'BNN': 'ALA', '5HP': 'GLU',
78 'IIL': 'ILE', 'SHR': 'LYS', 'HAR': 'ARG', 'FME': 'MET', 'ALO': 'THR',
79 'PHI': 'PHE', 'ALM': 'ALA', 'PHL': 'PHE', 'MEN': 'ASN', 'TPQ': 'ALA',
80 'GSC': 'GLY', 'PHE': 'PHE', 'ALA': 'ALA', 'MAA': 'ALA', 'MET': 'MET',
81 'UNK': 'UNK', 'LEU': 'LEU', 'ALY': 'LYS', 'SET': 'SER', 'GL3': 'GLY',
82 'TRG': 'LYS', 'CXM': 'MET', 'TYR': 'TYR', 'SCS': 'CYS', 'DIL': 'ILE',
83 'TYQ': 'TYR', '3AH': 'HIS', 'DPR': 'PRO', 'PRR': 'ALA', 'CME': 'CYS',
84 'IYR': 'TYR', 'CY1': 'CYS', 'TYY': 'TYR', 'HYP': 'PRO', 'DTY': 'TYR',
85 '2AS': 'ASP', 'DTR': 'TRP', 'FLA': 'ALA', 'DPN': 'PHE', 'DIV': 'VAL',
86 'PCA': 'GLU', 'MSE': 'MET', 'MSA': 'GLY', 'AIB': 'ALA', 'CYS': 'CYS',
87 'NLP': 'LEU', 'CYQ': 'CYS', 'HIS': 'HIS', 'DLE': 'LEU', 'CEA': 'CYS',
88 'DAL': 'ALA', 'LLP': 'LYS', 'DAH': 'PHE', 'HMR': 'ARG', 'TRO': 'TRP',
89 'HIC': 'HIS', 'CYG': 'CYS', 'BMT': 'THR', 'DAS': 'ASP', 'TYB': 'TYR',
90 'BUC': 'CYS', 'PEC': 'CYS', 'BUG': 'LEU', 'CYM': 'CYS', 'NLN': 'LEU',
91 'CY3': 'CYS', 'HIP': 'HIS', 'CSO': 'CYS', 'TPL': 'TRP', 'LYM': 'LYS',
92 'DHI': 'HIS', 'MLE': 'LEU', 'CSD': 'ALA', 'HPQ': 'PHE', 'MPQ': 'GLY',
93 'LLY': 'LYS', 'DHA': 'ALA', 'DSN': 'SER', 'SOC': 'CYS', 'CSX': 'CYS',
94 'OMT': 'MET', 'DSP': 'ASP', 'PTR': 'TYR', 'TRP': 'TRP', 'CSW': 'CYS',
95 'EFC': 'CYS', 'CSP': 'CYS', 'CSS': 'CYS', 'SCH': 'CYS', 'OCS': 'CYS',
96 'NMC': 'GLY', 'SEP': 'SER', 'BHD': 'ASP', 'KCX': 'LYS', 'SHC': 'CYS',
97 'C5C': 'CYS', 'HTR': 'TRP', 'ARG': 'ARG', 'TYS': 'TYR', 'ARM': 'ARG',
98 'DNP': 'ALA'
99 }
100 """
101 Dictionary of non-standard amino acids, which could be found in PDB.
102 """
103
104
105 PDB_NUCLEOTIDES = {
106 'DA': 'Adenine', 'DG': 'Guanine', 'DC': 'Cytosine', 'DT': 'Thymine',
107 'A': 'Adenine', 'G': 'Guanine', 'C': 'Cytosine', 'T': 'Thymine',
108 'U': 'Uracil', 'DOC': 'Cytosine', 'R': 'Purine', 'Y': 'Pyrimidine',
109 'K': 'Ketone', ' M': 'Amino', 'S': 'Strong', 'W': 'Weak',
110 'B': 'NotA', 'D' : 'NotC', 'H': 'NotG', 'V': 'NotT',
111 'N': 'Any', 'X' : 'Masked'
112 }
113 """
114 Dictionary of non-standard nucleotides, which could be found in PDB.
115 """
119
122
125
128
131
134
135 -class InvalidEntryIDError(StructureFormatError):
137
138
139 -class EntryID(object):
140 """
141 Represents a PDB Chain identifier. Implementing classes must define
142 how the original ID is split into accession number and chain ID.
143
144 @param id: identifier
145 @type id: str
146 """
147 __metaclass__ = ABCMeta
148
149 - def __init__(self, id):
150
151 self._accession = ''
152 self._chain = ''
153
154 id = id.strip()
155 self._accession, self._chain = self.parse(id)
156
157 @staticmethod
159 """
160 Guess the format of C{id} and parse it.
161
162 @return: a new PDB ID of the appropriate type
163 @rtype: L{EntryID}
164 """
165
166 if len(id) in (4, 5):
167 return StandardID(id)
168 elif len(id) == 6 and id[4] == '_':
169 return SeqResID(id)
170 else:
171 return DegenerateID(id)
172
173 @abstractmethod
174 - def parse(self, id):
175 """
176 Split C{id} into accession number and chain ID.
177
178 @param id: PDB identifier
179 @type id: str
180 @return: (accession, chain)
181 @rtype: tuple of str
182
183 @raise InvalidEntryIDError: when C{id} is in an inappropriate format
184 """
185 pass
186
188 """
189 @return: the identifier in its original format
190 @rtype: str
191 """
192 return self.entry_id
193
194 @property
195 - def accession(self):
196 """
197 Accession number part of the Entry ID
198 @rtype: str
199 """
200 return self._accession
201
202 @property
204 """
205 Chain ID part of the Entry ID
206 @rtype: str
207 """
208 return self._chain
209
210 @property
211 - def entry_id(self):
212 """
213 Accession number + Chain ID
214 @rtype: str
215 """
216 return "{0.accession}{0.chain}".format(self)
217
220
222 """
223 Standard PDB ID in the following form: xxxxY, where xxxx is the accession
224 number (lower case) and Y is an optional chain identifier.
225 """
232
234 """
235 Looks like a L{StandardID}, except that the accession number may have
236 arbitrary length.
237 """
244
246 """
247 Same as a L{StandardID}, but contains an additional underscore between
248 te accession number and the chain identifier.
249 """
256
259
262 """
263 A base PDB structure format-aware parser. Subclasses must implement the
264 internal abstract method C{_parse_header} in order to complete the
265 implementation.
266
267 @param structure_file: the input PD file to parse
268 @type structure_file: str
269 @param check_ss: if True, secondary structure errors in the file will cause
270 L{SecStructureFormatError} exceptions
271 @type check_ss: bool
272
273 @raise IOError: when the input file cannot be found
274 """
275
276 __metaclass__ = ABCMeta
277
278 @staticmethod
280 """
281 A StructureParser factory, which instantiates and returns the proper parser
282 object based on the contents of the PDB file.
283
284 If the file contains a SEQRES section, L{RegularStructureParser} is returned,
285 otherwise L{LegacyStructureParser} is instantiated. In the latter case
286 LegacyStructureParser will read the sequence data directly from the ATOMs.
287
288 @param structure_file: the PDB file to parse
289 @type structure_file: str
290
291 @rtype: L{AbstractStructureParser}
292 """
293 has_seqres = False
294
295 for line in open(structure_file):
296 if line.startswith('SEQRES'):
297 has_seqres = True
298 break
299
300 if has_seqres:
301 return RegularStructureParser(structure_file)
302 else:
303 return LegacyStructureParser(structure_file)
304
305 - def __init__(self, structure_file, check_ss=False):
306
307 self._file = None
308 self._stream = None
309 self._check_ss = bool(check_ss)
310
311 self.filename = structure_file
312
314 try:
315 self._stream.close()
316 except:
317 pass
318
319 @property
321 """
322 Current input PDB file name
323 @rtype: str
324 """
325 return self._file
326 @filename.setter
328 try:
329 stream = open(name)
330 except IOError:
331 raise IOError('File not found: {0}'.format(name))
332
333 if self._stream:
334 try:
335 self._stream.close()
336 except:
337 pass
338 self._stream = stream
339 self._file = name
340
342 """
343 Find all available model identifiers in the structure.
344
345 @return: a list of model IDs
346 @rtype: list
347 """
348 models = []
349 check = set()
350
351 with open(self._file, 'r') as f:
352 for line in f:
353 if line.startswith('MODEL'):
354 model_id = int(line[10:14])
355 if model_id in check:
356 raise StructureFormatError('Duplicate model identifier: {0}'.format(model_id))
357 models.append(model_id)
358 check.add(model_id)
359
360 if len(models) > 0:
361 if not(min(check) == 1 and max(check) == len(models)):
362 raise StructureFormatError('Non-consecutive model identifier(s) encountered')
363 return models
364 else:
365 return []
366
368 """
369 Try to guess what is the sequence type of a PDB C{residue_name}.
370
371 @param residue_name: a PDB-conforming name of a residue
372 @type residue_name: str
373
374 @return: a L{SequenceTypes} enum member
375 @rtype: L{csb.core.EnumItem}
376
377 @raise UnknownPDBResidueError: when there is no such PDB residue name
378 in the catalog tables
379 """
380 if residue_name in PDB_AMINOACIDS:
381 return SequenceTypes.Protein
382 elif residue_name in PDB_NUCLEOTIDES:
383 return SequenceTypes.NucleicAcid
384 else:
385 raise UnknownPDBResidueError(residue_name)
386
388 """
389 Try to parse a PDB C{residue_name} and return its closest 'normal'
390 string representation. If a sequence type (C{as_type}) is defined,
391 guess the alphabet based on that information, otherwise try first to
392 parse it as a protein residue.
393
394 @param residue_name: a PDB-conforming name of a residue
395 @type residue_name: str
396 @param as_type: suggest a sequence type (L{SequenceTypes} member)
397 @type L{scb.core.EnumItem}
398
399 @return: a normalized residue name
400 @rtype: str
401
402 @raise UnknownPDBResidueError: when there is no such PDB residue name
403 in the catalog table(s)
404 """
405 if as_type is None:
406 as_type = self.guess_sequence_type(residue_name)
407
408 try:
409 if as_type == SequenceTypes.Protein:
410 return PDB_AMINOACIDS[residue_name]
411 elif as_type == SequenceTypes.NucleicAcid:
412 return PDB_NUCLEOTIDES[residue_name]
413 else:
414 raise UnknownPDBResidueError(residue_name)
415 except KeyError:
416 raise UnknownPDBResidueError(residue_name)
417
440
441 - def parse(self, filename=None, model=None):
446
448 """
449 Parse and return the L{Structure} with the specified model identifier.
450 If no explicit model is specified, parse the first model in the
451 structure.
452
453 @param model: parse exactly the model with this ID
454 @type model: str
455
456 @return: object representation of the selected model
457 @rtype: L{Structure}
458
459 @raise ValueError: When an invalid model ID is specified
460 @raise PDBParseError: When the input PDB file suffers from unrecoverable
461 corruption. More specialized exceptions will be
462 raised depending on the context (see L{PDBParseError}'s
463 subclasses).
464 """
465 if model is not None:
466 model = int(model)
467
468 structure = self._parse_header(model)
469
470 self._parse_atoms(structure, model)
471 self._parse_ss(structure)
472
473 return structure
474
504
506 """
507 Parse and return the L{Structure} of the biological unit (quaternary
508 structure) as annotated by the REMARK 350 BIOMOLECULE record.
509
510 @param number: biomolecule number
511 @type number: int
512
513 @param single: if True, assign new single-letter chain
514 identifiers. If False, assign multi-letter chain identifiers whith a
515 number appended to the original identifier, like "A1", "A2", ...
516 @type single: bool
517
518 @return: structure of biological unit
519 @rtype: L{Structure}
520 """
521 remarks = self._parse_remarks()
522 if 350 not in remarks:
523 raise PDBParseError('There is no REMARK 350')
524
525 current = 1
526 biomt = {current: {}}
527 chains = tuple()
528
529 def split(line):
530 return [c.strip() for c in line.split(',') if c.strip() != '']
531
532 for line in remarks[350]:
533 if line.startswith('BIOMOLECULE:'):
534 current = int(line[12:])
535 biomt[current] = {}
536 elif line.startswith('APPLY THE FOLLOWING TO CHAINS:'):
537 chains = tuple(split(line[30:]))
538 elif line.startswith(' AND CHAINS:'):
539 chains += tuple(split(line[30:]))
540 elif line.startswith(' BIOMT'):
541 num = int(line[8:12])
542 vec = line[12:].split()
543 vec = list(map(float, vec))
544 biomt[current].setdefault(chains, dict()).setdefault(num, []).extend(vec)
545
546 if number not in biomt or len(biomt[number]) == 0:
547 raise KeyError('no BIOMOLECULE number {0}'.format(number))
548
549 asu = self.parse_structure()
550 structure = csb.bio.structure.Structure('{0}_{1}'.format(asu.accession, number))
551
552 newchainiditer = iter('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789')
553
554 for chains, matrices in biomt[number].items():
555 for num in matrices:
556 mat = numpy.array(matrices[num][0:12]).reshape((3,4))
557 R, t = mat[:3,:3], mat[:3,3]
558
559 for chain in chains:
560 if chain not in asu:
561 raise PDBParseError('chain {0} missing'.format(chain))
562 copy = asu[chain].clone()
563 copy.transform(R, t)
564 if single:
565 if len(structure.chains) == 62:
566 raise ValueError('too many chains for single=True')
567 copy.id = next(newchainiditer)
568 else:
569 copy.id = '{0}{1}'.format(chain, num)
570 structure.chains.append(copy)
571
572 return structure
573
574 @abstractmethod
576 """
577 An abstract method, which subclasses must use to customize the way the
578 PDB header (or the absence of a such) is handled. The implementation
579 must rely on reading character data from the current internal
580 self._stream and must return a new L{csb.bio.structure.Structure}
581 instance with properly initialized header data: accession, model,
582 molecule identifiers, chains and residues. This structure object is
583 then internally passed to the C{_parse_atoms} hook, responsible for
584 attachment of the atoms to the residues in the structure.
585 """
586 pass
587
603
605 """
606 Parse the ATOMs from the specified C{model} and attach them to the
607 C{structure}.
608 """
609
610 structure.model_id = None
611
612 atoms = dict( (chain, []) for chain in structure.chains )
613 chains = set()
614 total_chains = len([c for c in structure.items if c.length > 0])
615 het_residues = dict( (chain, set()) for chain in structure.chains )
616 in_ligands = False
617 in_atom = False
618
619 self._stream.seek(0)
620 while True:
621
622 try:
623 line = next(self._stream)
624 except StopIteration:
625 break
626
627 if line.startswith('HET '):
628 het_residue, het_chain, het_residue_id = line[7:10].strip(), line[12], line[13:18].strip()
629
630 if het_chain in structure:
631 chain = structure.chains[het_chain]
632 if chain.type == SequenceTypes.Protein and het_residue in PDB_AMINOACIDS:
633 het_residues[het_chain].add(het_residue_id)
634 elif chain.type == SequenceTypes.NucleicAcid and het_residue in PDB_NUCLEOTIDES:
635 het_residues[het_chain].add(het_residue_id)
636
637 elif line.startswith('MODEL'):
638 if model and model != int(line[10:14]):
639 self._scroll_model(model, self._stream)
640 structure.model_id = model
641 else:
642 model = int(line[10:14])
643 structure.model_id = model
644
645 elif line.startswith('ATOM') \
646 or (line.startswith('HETATM') and not in_ligands):
647 in_atom = True
648
649 rank = int(line[22:26])
650 serial_number = int(line[6:11])
651 name = line[12:16]
652 x, y, z = line[30:38], line[38:46], line[46:54]
653 vector = numpy.array([float(x), float(y), float(z)])
654 element = line[76:78].strip()
655 if element:
656 try:
657 element = csb.core.Enum.parsename(ChemElements, element)
658 except csb.core.EnumMemberError:
659 if element in ('D', 'X'):
660 element = ChemElements.x
661 else:
662 raise StructureFormatError('Unknown chemical element: {0}'.format(element))
663 else:
664 element = None
665
666 atom = csb.bio.structure.Atom(serial_number, name, element,
667 vector)
668
669 atom._het = line.startswith('HETATM')
670 atom._rank = rank
671 atom._sequence_number = int(line[22:26].strip())
672 atom._residue_id = str(atom._sequence_number)
673 atom._insertion_code = line[26].strip()
674 if not atom._insertion_code:
675 atom._insertion_code = None
676 else:
677 atom._residue_id += atom._insertion_code
678
679 atom.alternate = line[16].strip()
680 if not atom.alternate:
681 atom.alternate = None
682
683 atom._chain = line[21].strip()
684 if atom._chain not in structure.chains:
685 raise StructureFormatError(
686 'Atom {0}: chain {1} is undefined'.format(serial_number, atom._chain))
687 chains.add(atom._chain)
688 residue_name = line[17:20].strip()
689 residue_name = self.parse_residue_safe(residue_name, as_type=structure.chains[atom._chain].type)
690 if structure.chains[atom._chain].type == SequenceTypes.NucleicAcid:
691 atom._residue_name = csb.core.Enum.parsename(SequenceAlphabets.Nucleic, residue_name)
692 else:
693 atom._residue_name = csb.core.Enum.parsename(SequenceAlphabets.Protein, residue_name)
694
695 atom.occupancy = float(line[54:60])
696 atom.bfactor = float(line[60:66])
697
698 charge = line[78:80].strip()
699 if charge:
700 if charge in ('+', '-'):
701 charge += '1'
702 if charge[-1] in ('+', '-'):
703 charge = charge[::-1]
704 atom.charge = int(charge)
705
706 atoms[atom._chain].append(atom)
707
708 elif in_atom and line.startswith('TER'):
709 in_atom = False
710 if len(chains) == total_chains:
711 in_ligands = True
712
713 elif line.startswith('ENDMDL'):
714 break
715
716 elif line.startswith('END'):
717 break
718
719 if structure.model_id != model:
720 raise ValueError('No such model {0} in the structure.'.format(model))
721
722 self._map_residues(structure, atoms, het_residues)
723
725
726 assert set(structure.chains) == set(atoms.keys())
727
728 for chain in structure.chains:
729
730 subject = structure.chains[chain].sequence
731 query = ''
732 fragments = []
733
734 seq_numbers = []
735 lookup = {}
736
737 i = -1
738 for a in atoms[chain]:
739 if a._residue_id not in lookup:
740 i += 1
741 lookup[a._residue_id] = [a._sequence_number, a._insertion_code]
742 seq_numbers.append(a._residue_id)
743 res_name = a._residue_name.value
744 res_id = '{0}{1}'.format(a._sequence_number or '', a._insertion_code or '').strip()
745 if a._het and res_id not in het_residues[chain]:
746
747 fragments.append([res_name, '?'])
748 elif a._insertion_code and not (i > 0 and lookup[seq_numbers[i - 1]][1]):
749 fragments.append([res_name])
750 elif i == 0 or a._sequence_number - lookup[seq_numbers[i - 1]][0] not in (0, 1, -1):
751
752 fragments.append([res_name])
753 else:
754
755 if fragments[-1][-1] == '?':
756
757
758 fragments.append([res_name])
759 else:
760
761 fragments[-1].append(res_name)
762
763 for i, frag in enumerate(fragments):
764 fragments[i] = ''.join(frag)
765 if len(fragments) > 100:
766
767 raise StructureFormatError("Can't map structure with more than 100 fragments in ATOMS")
768 query = '^.*?({0}).*?$'.format(').*?('.join(fragments))
769
770 matches = re.match(query, subject)
771
772 if matches:
773 seq_numitem = -1
774 for frag_number, frag in enumerate(matches.groups(), start=1):
775 if frag is '':
776
777
778 seq_numitem += 1
779 else:
780 for i, dummy in enumerate(frag, start=1):
781 seq_numitem += 1
782
783 try:
784 lookup[ seq_numbers[seq_numitem] ] = matches.start(frag_number) + i
785 except:
786 raise
787
788 fixed_residue = None
789 for atom in atoms[chain]:
790 if not isinstance(lookup[atom._residue_id], int):
791 continue
792 atom._rank = lookup[atom._residue_id]
793 residue = structure.chains[chain].residues[atom._rank]
794 if residue is not fixed_residue:
795 residue.id = atom._sequence_number, atom._insertion_code
796 fixed_residue = residue
797
798 assert str(residue.type) == subject[atom._rank - 1]
799 residue.atoms.append(atom)
800
801 del atom._rank
802 del atom._insertion_code
803 del atom._sequence_number
804 del atom._chain
805 del atom._residue_id
806 del atom._residue_name
807 else:
808 if structure.chains[chain].length == 0 and len(atoms[chain]) > 0:
809 raise StructureFormatError("Can't add atoms: chain {0} has no residues".format(chain))
810 else:
811 raise StructureFormatError("Can't map atoms")
812
814 """
815 Parse and attach secondary structure data.
816
817 @bug: Currently the PDB helix types are ignored. Each HELIX line is treated
818 as a regular SecStructures.Helix. This is due to incompatibility
819 between DSSP and PDB helix types.
820 @todo: Implement a proper workaround for the previous bug (e.g. skip all
821 helices types not included in the DSSP enum)
822
823 @warning: In this implementation only the start/end positions of the SS
824 elements are parsed. Additional data like H-bonding is ignored.
825
826 @bug: Currently structure.to_pdb() is not writing any SS data.
827 """
828 elements = {}
829 self._stream.seek(0)
830
831 while True:
832 try:
833 line = next(self._stream)
834 except StopIteration:
835 break
836
837 if line.startswith('HELIX'):
838
839 chain = structure.chains[line[19].strip()]
840 if chain.id not in elements:
841 elements[chain.id] = []
842 if chain.id != line[31].strip():
843 if self._check_ss:
844 raise SecStructureFormatError('Helix {0} spans multiple chains'.format(line[7:10]))
845 else:
846 continue
847 try:
848 startres = chain.find(line[21:25].strip(), line[25].strip())
849 endres = chain.find(line[33:37].strip(), line[37].strip())
850 except csb.core.ItemNotFoundError as ex:
851 if self._check_ss:
852 raise SecStructureFormatError(
853 'Helix {0} refers to an undefined residue ID: {1}'.format(line[7:10], str(ex)))
854 else:
855 continue
856 if not startres.rank <= endres.rank:
857 if self._check_ss:
858 raise SecStructureFormatError('Helix {0} is out of range'.format(line[7:10]))
859 else:
860 continue
861 helix = csb.bio.structure.SecondaryStructureElement(startres.rank, endres.rank, SecStructures.Helix)
862 elements[chain.id].append(helix)
863
864 if line.startswith('SHEET'):
865
866 chain = structure.chains[line[21].strip()]
867 if chain.id not in elements:
868 elements[chain.id] = []
869 if chain.id != line[32].strip():
870 if self._check_ss:
871 raise SecStructureFormatError('Sheet {0} spans multiple chains'.format(line[7:10]))
872 else:
873 continue
874 try:
875 startres = chain.find(line[22:26].strip(), line[26].strip())
876 endres = chain.find(line[33:37].strip(), line[37].strip())
877 except csb.core.ItemNotFoundError as ex:
878 if self._check_ss:
879 raise SecStructureFormatError(
880 'Sheet {0} refers to an undefined residue ID: {1}'.format(line[7:10], str(ex)))
881 else:
882 continue
883 if not startres.rank <= endres.rank:
884 if self._check_ss:
885 raise SecStructureFormatError('Sheet {0} is out of range'.format(line[7:10]))
886 else:
887 continue
888 strand = csb.bio.structure.SecondaryStructureElement(startres.rank, endres.rank, SecStructures.Strand)
889 elements[chain.id].append(strand)
890
891 elif line.startswith('MODEL') or line.startswith('ATOM'):
892 break
893
894 for chain_id in elements:
895 ss = csb.bio.structure.SecondaryStructure()
896 for e in elements[chain_id]:
897 ss.append(e)
898 structure.chains[chain_id].secondary_structure = ss
899
901 """
902 Read REMARK lines from PDB file.
903
904 @return: dictionary with remark numbers as keys, and lists of lines as values.
905 @rtype: dict
906 """
907 self._stream.seek(0)
908
909 remarks = {}
910
911 for line in self._stream:
912 if line.startswith('REMARK'):
913 num = int(line[7:10])
914 lstring = line[11:]
915 remarks.setdefault(num, []).append(lstring)
916 elif line.startswith('DBREF') or line.startswith('ATOM'):
917 break
918
919 return remarks
920
923 """
924 This is the de facto PDB parser, which is designed to read SEQRES and ATOM
925 sections separately, and them map them. Intentionally fails to parse
926 malformed PDB files, e.g. a PDB file without a HEADER section.
927 """
928
930 """
931 Parse the HEADER section of a regular PDB file.
932
933 @return: a L{csb.bio.structure.Structure} instance with properly
934 initialized residues from the SEQRES.
935 @rtype: L{csb.bio.structure.Structure}
936
937 @raise PDBParseError: if the stream has no HEADER at byte 0
938 """
939
940 self._stream.seek(0)
941
942 header = next(self._stream)
943 if not header.startswith('HEADER'):
944 raise PDBParseError('Does not look like a regular PDB file.')
945
946 structure = csb.bio.structure.Structure(header.split()[-1])
947
948 while True:
949
950 try:
951 line = next(self._stream)
952 except StopIteration:
953 break
954
955 if line.startswith('COMPND'):
956 if line[10:].lstrip().startswith('MOL_ID:'):
957 mol_id = int(line[18:].replace(';', '').strip())
958 chain_name = ''
959 chains = ''
960 while line.startswith('COMPND'):
961 line = next(self._stream)
962 if line.split()[2].startswith('MOLECULE:'):
963 chain_name += line[20:].strip()
964 while not chain_name.endswith(';'):
965 line = next(self._stream)
966 if not line.startswith('COMPND'):
967 break
968 chain_name += ' ' + line[11:].strip()
969 else:
970 while not line.split()[2].startswith('CHAIN:'):
971 line = next(self._stream)
972 if not line.startswith('COMPND'):
973 raise HeaderFormatError('Missing chain identifier in COMPND section')
974 chains = line[17:].strip()
975 while not chains.endswith(';'):
976 line = next(self._stream)
977 if not line.startswith('COMPND'):
978 break
979 chains += ', ' + line[11:].strip()
980 break
981
982 chain_name = chain_name.strip()[:-1]
983 for chain in chains.replace(';', ' ').replace(',', ' ').split() or ['']:
984 new_chain = csb.bio.structure.Chain(chain, type=SequenceTypes.Unknown,
985 name=chain_name, accession=structure.accession)
986 new_chain.molecule_id = mol_id
987 try:
988 structure.chains.append(new_chain)
989 except csb.core.DuplicateKeyError:
990 raise HeaderFormatError('Chain {0} is already defined.'.format(new_chain.id))
991
992 elif line.startswith('SEQRES'):
993
994 seq_fields = [line[7:10], line[11], line[13:17] ]
995 seq_fields.extend(line[18:].split())
996
997 rank_base = int(seq_fields[0])
998 chain_id = seq_fields[1].strip()
999
1000 if chain_id not in structure.chains:
1001 raise HeaderFormatError('Chain {0} is undefined'.format(chain_id))
1002
1003 chain = structure.chains[chain_id]
1004
1005 if chain.type == SequenceTypes.Unknown:
1006 inner_residuerank = int(len(seq_fields[3:]) / 2) + 3
1007 for i in [inner_residuerank, 3, -1]:
1008 try:
1009 chain.type = self.guess_sequence_type(seq_fields[i])
1010 break
1011 except UnknownPDBResidueError:
1012 pass
1013
1014 for i, residue_name in enumerate(seq_fields[3:]):
1015 rank = rank_base * 13 - (13 - (i + 1))
1016 rname = self.parse_residue_safe(residue_name, as_type=chain.type)
1017 residue = csb.bio.structure.Residue.create(chain.type, rank=rank, type=rname)
1018 residue._pdb_name = residue_name
1019 structure.chains[chain_id].residues.append(residue)
1020 assert structure.chains[chain_id].residues.last_index == rank
1021
1022 elif line.startswith('MODEL') or line.startswith('ATOM'):
1023 break
1024
1025 return structure
1026
1029 """
1030 Ultra fast PDB HEADER parser. Does not read any structural data.
1031 """
1032
1035
1038
1041
1044 """
1045 This is a customized PDB parser, which is designed to read both sequence and
1046 atom data from the ATOM section. This is especially useful when parsing PDB
1047 files without a header.
1048 """
1049
1144
1145
1146 StructureParser = AbstractStructureParser.create_parser
1147 """
1148 Alias for L{AbstractStructureParser.create_parser}.
1149 """
1152 """
1153 Base abstract files for all structure file formatters.
1154 Defines a common step-wise interface according to the Builder pattern.
1155
1156 @param output: output stream (this is where the product is constructed)
1157 @type output: stream
1158 """
1159
1160 __metaclass__ = ABCMeta
1161
1163
1164 if not hasattr(output, 'write'):
1165 raise TypeError(output)
1166
1167 def isnull(this, that, null=None):
1168 if this is null:
1169 return that
1170 else:
1171 return this
1172
1173 self._out = output
1174 self._isnull = isnull
1175
1176 @property
1178 """
1179 Destination stream
1180 @rtype: stream
1181 """
1182 return self._out
1183
1184 @property
1186 """
1187 ISNULL(X, Y) function
1188 @rtype: callable
1189 """
1190 return self._isnull
1191
1193 """
1194 Write a chunk of text
1195 """
1196 self._out.write(text)
1197
1199 """
1200 Write a chunk of text and append a new line terminator
1201 """
1202 self._out.write(text)
1203 self._out.write('\n')
1204
1205 @abstractmethod
1208
1209 @abstractmethod
1212
1215
1217 """
1218 PDB file format builder.
1219 """
1220
1223
1225 """
1226 Write the HEADER of the file using C{master}
1227
1228 @type master: L{Structure}
1229 """
1230
1231 isnull = self.isnull
1232
1233 header = 'HEADER {0:40}{1:%d-%b-%y} {2:4}'
1234 self.writeline(header.format('.', datetime.datetime.now(), master.accession.upper()))
1235
1236 molecules = { }
1237
1238 for chain_id in master.chains:
1239 chain = master.chains[chain_id]
1240 if chain.molecule_id not in molecules:
1241 molecules[chain.molecule_id] = [ ]
1242 molecules[chain.molecule_id].append(chain_id)
1243
1244 k = 0
1245 for mol_id in sorted(molecules):
1246
1247 chains = molecules[mol_id]
1248 first_chain = master.chains[ chains[0] ]
1249
1250 self.writeline('COMPND {0:3} MOL_ID: {1};'.format(k + 1, isnull(mol_id, '0')))
1251 self.writeline('COMPND {0:3} MOLECULE: {1};'.format(k + 2, isnull(first_chain.name, '')))
1252 self.writeline('COMPND {0:3} CHAIN: {1};'.format(k + 3, ', '.join(chains)))
1253 k += 3
1254
1255 for chain_id in master.chains:
1256
1257 chain = master.chains[chain_id]
1258 res = [ r._pdb_name for r in chain.residues ]
1259
1260 rn = 0
1261 for j in range(0, chain.length, 13):
1262 rn += 1
1263 residues = [ '{0:>3}'.format(r) for r in res[j : j + 13] ]
1264 self.writeline('SEQRES {0:>3} {1} {2:>4} {3}'.format(
1265 rn, chain.id, chain.length, ' '.join(residues) ))
1266
1268 """
1269 Append a new model to the file
1270
1271 @type structure: L{Structure}
1272 """
1273
1274 isnull = self.isnull
1275
1276 for chain_id in structure.chains:
1277
1278 chain = structure.chains[chain_id]
1279 for residue in chain.residues:
1280
1281 atoms = [ ]
1282 for an in residue.atoms:
1283 atom = residue.atoms[an]
1284 if isinstance(atom, csb.bio.structure.DisorderedAtom):
1285 for dis_atom in atom: atoms.append(dis_atom)
1286 else:
1287 atoms.append(atom)
1288 atoms.sort()
1289
1290 for atom in atoms:
1291
1292 alt = atom.alternate
1293 if alt is True:
1294 alt = 'A'
1295 elif alt is False:
1296 alt = ' '
1297
1298 if atom.element:
1299 element = repr(atom.element)
1300 else:
1301 element = ' '
1302 self.writeline('ATOM {0:>5} {1:>4}{2}{3:>3} {4}{5:>4}{6} {7:>8.3f}{8:>8.3f}{9:>8.3f}{10:>6.2f}{11:>6.2f}{12:>12}{13:2}'.format(
1303 atom.serial_number, atom._full_name, isnull(alt, ' '),
1304 residue._pdb_name, chain.id,
1305 isnull(residue.sequence_number, residue.rank), isnull(residue.insertion_code, ' '),
1306 atom.vector[0], atom.vector[1], atom.vector[2], isnull(atom.occupancy, 0.0), isnull(atom.bfactor, 0.0),
1307 element, isnull(atom.charge, ' ') ))
1308
1309 self.writeline('TER')
1310
1312 """
1313 Add the END marker
1314 """
1315 self.writeline('END')
1316 self._out.flush()
1317
1319 """
1320 Supports serialization of NMR ensembles.
1321
1322 Functions as a simple decorator, which wraps C{add_structure} with
1323 MODEL/ENDMDL records.
1324 """
1325
1333
1336 """
1337 Base class for all PDB data source providers.
1338
1339 Concrete classes need to implement the C{find} method, which abstracts the
1340 retrieval of a PDB structure file by a structure identifier. This is a hook
1341 method called internally by C{get}, but subclasses can safely override both
1342 C{find} and {get} to in order to achieve completely custom behavior.
1343 """
1344
1345 __metaclass__ = ABCMeta
1346
1349
1350 @abstractmethod
1351 - def find(self, id):
1352 """
1353 Attempt to discover a PDB file, given a specific PDB C{id}.
1354
1355 @param id: structure identifier (e.g. 1x80)
1356 @type id: str
1357 @return: path and file name on success, None otherwise
1358 @rtype: str or None
1359 """
1360 pass
1361
1362 - def get(self, id, model=None):
1363 """
1364 Discover, parse and return the PDB structure, corresponding to the
1365 specified C{id}.
1366
1367 @param id: structure identifier (e.g. 1x80)
1368 @type id: str
1369 @param model: optional model identifier
1370 @type model: str
1371
1372 @rtype: L{csb.bio.Structure}
1373
1374 @raise StructureNotFoundError: when C{id} could not be found
1375 """
1376 pdb = self.find(id)
1377
1378 if pdb is None:
1379 raise StructureNotFoundError(id)
1380 else:
1381 return StructureParser(pdb).parse_structure(model=model)
1382
1384 """
1385 Simple file system based PDB data source. Scans a list of local directories
1386 using pre-defined file name templates.
1387
1388 @param paths: a list of paths
1389 @type paths: iterable or str
1390 """
1391
1403
1404 @property
1406 """
1407 Current search paths
1408 @rtype: tuple
1409 """
1410 return tuple(self._paths)
1411
1412 @property
1414 """
1415 Current file name match templates
1416 @rtype: tuple
1417 """
1418 return tuple(self._templates)
1419
1420 - def add(self, path):
1421 """
1422 Register a new local C{path}.
1423
1424 @param path: directory name
1425 @type path: str
1426
1427 @raise IOError: if C{path} is not a valid directory
1428 """
1429 if os.path.isdir(path):
1430 self._paths[path] = path
1431 else:
1432 raise IOError(path)
1433
1435 """
1436 Register a custom file name name C{template}. The template must contain
1437 an E{lb}idE{rb} macro, e.g. pdbE{lb}idE{rb}.ent
1438
1439 @param template: pattern
1440 @type template: str
1441 """
1442
1443 if '{id}' not in template:
1444 raise ValueError('Template does not contain an "{id}" macro')
1445
1446 if template not in self._templates:
1447 self._templates.append(template)
1448
1450 """
1451 Unregister an existing local C{path}.
1452
1453 @param path: directory name
1454 @type path: str
1455
1456 @raise ValueError: if C{path} had not been registered
1457 """
1458 if path not in self._paths:
1459 raise ValueError('path not found: {0}'.format(path))
1460
1461 del self._paths[path]
1462
1463 - def find(self, id):
1464
1465 for path in self._paths:
1466 for token in self.templates:
1467 fn = os.path.join(path, token.format(id=id))
1468 if os.path.exists(fn):
1469 return fn
1470
1471 return None
1472
1474 """
1475 Retrieves PDB structures from a specified remote URL.
1476 The URL requested from remote server takes the form: <prefix>/<ID><suffix>
1477
1478 @param prefix: URL prefix, including protocol
1479 @type prefix: str
1480 @param suffix: optional URL suffix (.ent by default)
1481 @type suffix: str
1482 """
1483
1484 - def __init__(self, prefix='http://www.rcsb.org/pdb/files/pdb', suffix='.ent'):
1491
1492 @property
1494 """
1495 Current URL prefix
1496 @rtype: str
1497 """
1498 return self._prefix
1499 @prefix.setter
1501 self._prefix = value
1502
1503 @property
1505 """
1506 Current URL suffix
1507 @rtype: str
1508 """
1509 return self._suffix
1510 @suffix.setter
1512 self._suffix = value
1513
1520
1521 - def find(self, id):
1535
1536 - def get(self, id, model=None):
1547
1549 """
1550 A custom PDB data source. Functions as a user-defined map of structure
1551 identifiers and their corresponding local file names.
1552
1553 @param files: initialization dictionary of id:file pairs
1554 @type files: dict-like
1555 """
1556
1558
1559 self._files = {}
1560 for id in files:
1561 self.add(id, files[id])
1562
1563 @property
1565 """
1566 List of currently registered file names
1567 @rtype: tuple
1568 """
1569 return tuple(self._files.values())
1570
1571 @property
1573 """
1574 List of currently registered structure identifiers
1575 @rtype: tuple
1576 """
1577 return tuple(self._files)
1578
1579 - def add(self, id, path):
1580 """
1581 Register a new local C{id}:C{path} pair.
1582
1583 @param id: structure identifier
1584 @type id: str
1585 @param path: path and file name
1586 @type path: str
1587
1588 @raise IOError: if C{path} is not a valid file name
1589 """
1590 if os.path.isfile(path):
1591 self._files[id] = path
1592 else:
1593 raise IOError(path)
1594
1596 """
1597 Unregister an existing structure C{id}.
1598
1599 @param id: structure identifier
1600 @type id: str
1601
1602 @raise ValueError: if C{id} had not been registered
1603 """
1604 if id not in self._files:
1605 raise ValueError(id)
1606 else:
1607 del self._files[id]
1608
1609 - def find(self, id):
1610
1611 if id in self._files:
1612 return self._files[id]
1613 else:
1614 return None
1615
1616 -def get(accession, model=None, prefix='http://www.rcsb.org/pdb/files/pdb'):
1617 """
1618 Download and parse a PDB entry.
1619
1620 @param accession: accession number of the entry
1621 @type accession: str
1622 @param model: model identifier
1623 @type model: str
1624 @param prefix: download URL prefix
1625 @type prefix: str
1626
1627 @return: object representation of the selected model
1628 @rtype: L{Structure}
1629 """
1630 return RemoteStructureProvider(prefix).get(accession, model=model)
1631
1632 -def find(id, paths):
1633 """
1634 Try to discover a PDB file for PDB C{id} in C{paths}.
1635
1636 @param id: PDB ID of the entry
1637 @type id: str
1638 @param paths: a list of directories to scan
1639 @type paths: list of str
1640
1641 @return: path and file name on success, None otherwise
1642 @rtype: str
1643 """
1644 return FileSystemStructureProvider(paths).find(id)
1645
1648
1649 - def __init__(self, result, exception):
1650
1651 self.result = result
1652 self.exception = exception
1653
1655 return '<AsyncParseResult: result={0.result}, error={0.exception.__class__.__name__}>'.format(self)
1656
1661
1664 """
1665 Wraps StructureParser in an asynchronous call. Since a new process is
1666 started by Python internally (as opposed to only starting a new thread),
1667 this makes the parser slower, but provides a way to set a parse timeout
1668 limit.
1669
1670 If initialized with more than one worker, supports parallel parsing
1671 through the C{self.parse_async} method.
1672
1673 @param workers: number of worker threads (1 by default)
1674 @type workers: int
1675 """
1676
1678
1679 self._pool = None
1680 self._workers = 1
1681
1682 if int(workers) > 0:
1683 self._workers = int(workers)
1684 else:
1685 raise ValueError(workers)
1686
1687 self._recycle()
1688
1690
1691 if self._pool:
1692 self._pool.terminate()
1693
1694 self._pool = multiprocessing.Pool(processes=self._workers)
1695
1698 """
1699 Call StructureParser.parse_structure() in a separate process and return
1700 the output. Raise TimeoutError if the parser does not respond within
1701 C{timeout} seconds.
1702
1703 @param structure_file: structure file to parse
1704 @type structure_file: str
1705 @param timeout: raise multiprocessing.TimeoutError if C{timeout} seconds
1706 elapse before the parser completes its job
1707 @type timeout: int
1708 @param parser: any implementing L{AbstractStructureParser} class
1709 @type parser: type
1710
1711 @return: parsed structure
1712 @rtype: L{csb.structure.Structure}
1713 """
1714
1715 r = self.parse_async([structure_file], timeout, model, parser)
1716 if len(r) > 0:
1717 if r[0].exception is not None:
1718 raise r[0].exception
1719 else:
1720 return r[0].result
1721 return None
1722
1725 """
1726 Call C{self.parse_structure} for a list of structure files
1727 simultaneously. The actual degree of parallelism will depend on the
1728 number of workers specified while constructing the parser object.
1729
1730 @note: Don't be tempted to pass a large list of structures to this
1731 method. Every time a C{TimeoutError} is encountered, the
1732 corresponding worker process in the pool will hang until the
1733 process terminates on its own. During that time, this worker is
1734 unusable. If a sufficiently high number of timeouts occur, the
1735 whole pool of workers will be unsable. At the end of the method
1736 however a pool cleanup is performed and any unusable workers
1737 are 'reactivated'. However, that only happens at B{the end} of
1738 C{parse_async}.
1739
1740 @param structure_files: a list of structure files
1741 @type structure_files: tuple of str
1742 @param timeout: raise multiprocessing.TimeoutError if C{timeout} seconds
1743 elapse before the parser completes its job
1744 @type timeout: int
1745 @param parser: any implementing L{AbstractStructureParser} class
1746 @type parser: type
1747
1748 @return: a list of L{AsyncParseResult} objects
1749 @rtype: list
1750 """
1751
1752 pool = self._pool
1753 workers = []
1754 results = []
1755
1756 for file in list(structure_files):
1757 result = pool.apply_async(_parse_async, [parser, file, model])
1758 workers.append(result)
1759
1760 hanging = False
1761 for w in workers:
1762 result = AsyncParseResult(None, None)
1763 try:
1764 result.result = w.get(timeout=timeout)
1765 except KeyboardInterrupt as ki:
1766 pool.terminate()
1767 raise ki
1768 except Exception as ex:
1769 result.exception = ex
1770 if isinstance(ex, multiprocessing.TimeoutError):
1771 hanging = True
1772 results.append(result)
1773
1774 if hanging:
1775 self._recycle()
1776
1777 return results
1778