1 """
2 PDB structure parsers, format builders and database providers.
3
4 The most basic usage is:
5
6 >>> parser = StructureParser('structure.pdb')
7 >>> parser.parse_structure()
8 <Structure> # a Structure object (model)
9
10 or if this is an NMR ensemble:
11
12 >>> parser.parse_models()
13 <Ensemble> # an Ensemble object (collection of alternative Structure-s)
14
15 This module introduces a family of PDB file parsers. The common interface of all
16 parsers is defined in L{AbstractStructureParser}. This class has several
17 implementations:
18
19 - L{RegularStructureParser} - handles normal PDB files with SEQRES fields
20 - L{LegacyStructureParser} - reads structures from legacy or malformed PDB files,
21 which are lacking SEQRES records (initializes all residues from the ATOMs instead)
22 - L{PDBHeaderParser} - reads only the headers of the PDB files and produces structures
23 without coordinates. Useful for reading metadata (e.g. accession numbers or just
24 plain SEQRES sequences) with minimum overhead
25
26 Unless you have a special reason, you should use the L{StructureParser} factory,
27 which returns a proper L{AbstractStructureParser} implementation, depending on the
28 input PDB file. If the input file looks like a regular PDB file, the factory
29 returns a L{RegularStructureParser}, otherwise it instantiates L{LegacyStructureParser}.
30 L{StructureParser} is in fact an alias for L{AbstractStructureParser.create_parser}.
31
32 Another important abstraction in this module is L{StructureProvider}. It has several
33 implementations which can be used to retrieve PDB L{Structure}s from various sources:
34 file system directories, remote URLs, etc. You can easily create your own provider as
35 well. See L{StructureProvider} for details.
36
37 Finally, this module gives you some L{FileBuilder}s, used for text serialization
38 of L{Structure}s and L{Ensemble}s:
39
40 >>> builder = PDBFileBuilder(stream)
41 >>> builder.add_header(structure)
42 >>> builder.add_structure(structure)
43
44 where stream is any Python stream, e.g. an open file or sys.stdout.
45
46 See L{Ensemble} and L{Structure} from L{csb.bio.structure} for details on these
47 objects.
48 """
49
50 import re
51 import os
52 import numpy
53 import datetime
54 import multiprocessing
55
56 import csb.bio.structure
57 import csb.core
58 import csb.io
59
60 from abc import ABCMeta, abstractmethod
61 from csb.bio.sequence import SequenceTypes, SequenceAlphabets
62 from csb.bio.structure import ChemElements, SecStructures
63
64
65 PDB_AMINOACIDS = {
66 'PAQ': 'TYR', 'AGM': 'ARG', 'ILE': 'ILE', 'PR3': 'CYS', 'GLN': 'GLN',
67 'DVA': 'VAL', 'CCS': 'CYS', 'ACL': 'ARG', 'GLX': 'GLX', 'GLY': 'GLY',
68 'GLZ': 'GLY', 'DTH': 'THR', 'OAS': 'SER', 'C6C': 'CYS', 'NEM': 'HIS',
69 'DLY': 'LYS', 'MIS': 'SER', 'SMC': 'CYS', 'GLU': 'GLU', 'NEP': 'HIS',
70 'BCS': 'CYS', 'ASQ': 'ASP', 'ASP': 'ASP', 'SCY': 'CYS', 'SER': 'SER',
71 'LYS': 'LYS', 'SAC': 'SER', 'PRO': 'PRO', 'ASX': 'ASX', 'DGN': 'GLN',
72 'DGL': 'GLU', 'MHS': 'HIS', 'ASB': 'ASP', 'ASA': 'ASP', 'NLE': 'LEU',
73 'DCY': 'CYS', 'ASK': 'ASP', 'GGL': 'GLU', 'STY': 'TYR', 'SEL': 'SER',
74 'CGU': 'GLU', 'ASN': 'ASN', 'ASL': 'ASP', 'LTR': 'TRP', 'DAR': 'ARG',
75 'VAL': 'VAL', 'CHG': 'ALA', 'TPO': 'THR', 'CLE': 'LEU', 'GMA': 'GLU',
76 'HAC': 'ALA', 'AYA': 'ALA', 'THR': 'THR', 'TIH': 'ALA', 'SVA': 'SER',
77 'MVA': 'VAL', 'SAR': 'GLY', 'LYZ': 'LYS', 'BNN': 'ALA', '5HP': 'GLU',
78 'IIL': 'ILE', 'SHR': 'LYS', 'HAR': 'ARG', 'FME': 'MET', 'ALO': 'THR',
79 'PHI': 'PHE', 'ALM': 'ALA', 'PHL': 'PHE', 'MEN': 'ASN', 'TPQ': 'ALA',
80 'GSC': 'GLY', 'PHE': 'PHE', 'ALA': 'ALA', 'MAA': 'ALA', 'MET': 'MET',
81 'UNK': 'UNK', 'LEU': 'LEU', 'ALY': 'LYS', 'SET': 'SER', 'GL3': 'GLY',
82 'TRG': 'LYS', 'CXM': 'MET', 'TYR': 'TYR', 'SCS': 'CYS', 'DIL': 'ILE',
83 'TYQ': 'TYR', '3AH': 'HIS', 'DPR': 'PRO', 'PRR': 'ALA', 'CME': 'CYS',
84 'IYR': 'TYR', 'CY1': 'CYS', 'TYY': 'TYR', 'HYP': 'PRO', 'DTY': 'TYR',
85 '2AS': 'ASP', 'DTR': 'TRP', 'FLA': 'ALA', 'DPN': 'PHE', 'DIV': 'VAL',
86 'PCA': 'GLU', 'MSE': 'MET', 'MSA': 'GLY', 'AIB': 'ALA', 'CYS': 'CYS',
87 'NLP': 'LEU', 'CYQ': 'CYS', 'HIS': 'HIS', 'DLE': 'LEU', 'CEA': 'CYS',
88 'DAL': 'ALA', 'LLP': 'LYS', 'DAH': 'PHE', 'HMR': 'ARG', 'TRO': 'TRP',
89 'HIC': 'HIS', 'CYG': 'CYS', 'BMT': 'THR', 'DAS': 'ASP', 'TYB': 'TYR',
90 'BUC': 'CYS', 'PEC': 'CYS', 'BUG': 'LEU', 'CYM': 'CYS', 'NLN': 'LEU',
91 'CY3': 'CYS', 'HIP': 'HIS', 'CSO': 'CYS', 'TPL': 'TRP', 'LYM': 'LYS',
92 'DHI': 'HIS', 'MLE': 'LEU', 'CSD': 'ALA', 'HPQ': 'PHE', 'MPQ': 'GLY',
93 'LLY': 'LYS', 'DHA': 'ALA', 'DSN': 'SER', 'SOC': 'CYS', 'CSX': 'CYS',
94 'OMT': 'MET', 'DSP': 'ASP', 'PTR': 'TYR', 'TRP': 'TRP', 'CSW': 'CYS',
95 'EFC': 'CYS', 'CSP': 'CYS', 'CSS': 'CYS', 'SCH': 'CYS', 'OCS': 'CYS',
96 'NMC': 'GLY', 'SEP': 'SER', 'BHD': 'ASP', 'KCX': 'LYS', 'SHC': 'CYS',
97 'C5C': 'CYS', 'HTR': 'TRP', 'ARG': 'ARG', 'TYS': 'TYR', 'ARM': 'ARG',
98 'DNP': 'ALA'
99 }
100 """
101 Dictionary of non-standard amino acids, which could be found in PDB.
102 """
103
104
105 PDB_NUCLEOTIDES = {
106 'DA': 'Adenine', 'DG': 'Guanine', 'DC': 'Cytosine', 'DT': 'Thymine',
107 'A': 'Adenine', 'G': 'Guanine', 'C': 'Cytosine', 'T': 'Thymine',
108 'U': 'Uracil', 'DOC': 'Cytosine', 'R': 'Purine', 'Y': 'Pyrimidine',
109 'K': 'Ketone', ' M': 'Amino', 'S': 'Strong', 'W': 'Weak',
110 'B': 'NotA', 'D' : 'NotC', 'H': 'NotG', 'V': 'NotT',
111 'N': 'Any', 'X' : 'Masked'
112 }
113 """
114 Dictionary of non-standard nucleotides, which could be found in PDB.
115 """
119
122
125
128
131
134
135 -class InvalidEntryIDError(StructureFormatError):
137
138
139 -class EntryID(object):
140 """
141 Represents a PDB Chain identifier. Implementing classes must define
142 how the original ID is split into accession number and chain ID.
143
144 @param id: identifier
145 @type id: str
146 """
147 __metaclass__ = ABCMeta
148
149 - def __init__(self, id):
150
151 self._accession = ''
152 self._chain = ''
153
154 id = id.strip()
155 self._accession, self._chain = self.parse(id)
156
157 @staticmethod
159 """
160 Guess the format of C{id} and parse it.
161
162 @return: a new PDB ID of the appropriate type
163 @rtype: L{EntryID}
164 """
165
166 if len(id) in (4, 5):
167 return StandardID(id)
168 elif len(id) == 6 and id[4] == '_':
169 return SeqResID(id)
170 else:
171 return DegenerateID(id)
172
173 @abstractmethod
174 - def parse(self, id):
175 """
176 Split C{id} into accession number and chain ID.
177
178 @param id: PDB identifier
179 @type id: str
180 @return: (accession, chain)
181 @rtype: tuple of str
182
183 @raise InvalidEntryIDError: when C{id} is in an inappropriate format
184 """
185 pass
186
188 """
189 @return: the identifier in its original format
190 @rtype: str
191 """
192 return self.entry_id
193
194 @property
195 - def accession(self):
196 """
197 Accession number part of the Entry ID
198 @rtype: str
199 """
200 return self._accession
201
202 @property
204 """
205 Chain ID part of the Entry ID
206 @rtype: str
207 """
208 return self._chain
209
210 @property
211 - def entry_id(self):
212 """
213 Accession number + Chain ID
214 @rtype: str
215 """
216 return "{0.accession}{0.chain}".format(self)
217
220
222 """
223 Standard PDB ID in the following form: xxxxY, where xxxx is the accession
224 number (lower case) and Y is an optional chain identifier.
225 """
232
234 """
235 Looks like a L{StandardID}, except that the accession number may have
236 arbitrary length.
237 """
244
246 """
247 Same as a L{StandardID}, but contains an additional underscore between
248 te accession number and the chain identifier.
249 """
256
259
262 """
263 A base PDB structure format-aware parser. Subclasses must implement the
264 internal abstract method C{_parse_header} in order to complete the
265 implementation.
266
267 @param structure_file: the input PD file to parse
268 @type structure_file: str
269 @param check_ss: if True, secondary structure errors in the file will cause
270 L{SecStructureFormatError} exceptions
271 @type check_ss: bool
272
273 @raise IOError: when the input file cannot be found
274 """
275
276 __metaclass__ = ABCMeta
277
278 @staticmethod
280 """
281 A StructureParser factory, which instantiates and returns the proper parser
282 object based on the contents of the PDB file.
283
284 If the file contains a SEQRES section, L{RegularStructureParser} is returned,
285 otherwise L{LegacyStructureParser} is instantiated. In the latter case
286 LegacyStructureParser will read the sequence data directly from the ATOMs.
287
288 @param structure_file: the PDB file to parse
289 @type structure_file: str
290
291 @rtype: L{AbstractStructureParser}
292 """
293 has_seqres = False
294
295 for line in open(structure_file):
296 if line.startswith('SEQRES'):
297 has_seqres = True
298 break
299
300 if has_seqres:
301 return RegularStructureParser(structure_file)
302 else:
303 return LegacyStructureParser(structure_file)
304
305 - def __init__(self, structure_file, check_ss=False):
306
307 self._file = None
308 self._stream = None
309 self._check_ss = bool(check_ss)
310
311 self.filename = structure_file
312
314 try:
315 self._stream.close()
316 except:
317 pass
318
319 @property
321 """
322 Current input PDB file name
323 @rtype: str
324 """
325 return self._file
326 @filename.setter
328 try:
329 stream = open(name)
330 except IOError:
331 raise IOError('File not found: {0}'.format(name))
332
333 if self._stream:
334 try:
335 self._stream.close()
336 except:
337 pass
338 self._stream = stream
339 self._file = name
340
342 """
343 Find all available model identifiers in the structure.
344
345 @return: a list of model IDs
346 @rtype: list
347 """
348 models = []
349 check = set()
350
351 with open(self._file, 'r') as f:
352 for line in f:
353 if line.startswith('MODEL'):
354 model_id = int(line[10:14])
355 if model_id in check:
356 raise StructureFormatError('Duplicate model identifier: {0}'.format(model_id))
357 models.append(model_id)
358 check.add(model_id)
359
360 if len(models) > 0:
361 if not(min(check) == 1 and max(check) == len(models)):
362 raise StructureFormatError('Non-consecutive model identifier(s) encountered')
363 return models
364 else:
365 return []
366
368 """
369 Try to guess what is the sequence type of a PDB C{residue_name}.
370
371 @param residue_name: a PDB-conforming name of a residue
372 @type residue_name: str
373
374 @return: a L{SequenceTypes} enum member
375 @rtype: L{csb.core.EnumItem}
376
377 @raise UnknownPDBResidueError: when there is no such PDB residue name
378 in the catalog tables
379 """
380 if residue_name in PDB_AMINOACIDS:
381 return SequenceTypes.Protein
382 elif residue_name in PDB_NUCLEOTIDES:
383 return SequenceTypes.NucleicAcid
384 else:
385 raise UnknownPDBResidueError(residue_name)
386
388 """
389 Try to parse a PDB C{residue_name} and return its closest 'normal'
390 string representation. If a sequence type (C{as_type}) is defined,
391 guess the alphabet based on that information, otherwise try first to
392 parse it as a protein residue.
393
394 @param residue_name: a PDB-conforming name of a residue
395 @type residue_name: str
396 @param as_type: suggest a sequence type (L{SequenceTypes} member)
397 @type L{scb.core.EnumItem}
398
399 @return: a normalized residue name
400 @rtype: str
401
402 @raise UnknownPDBResidueError: when there is no such PDB residue name
403 in the catalog table(s)
404 """
405 if as_type is None:
406 as_type = self.guess_sequence_type(residue_name)
407
408 try:
409 if as_type == SequenceTypes.Protein:
410 return PDB_AMINOACIDS[residue_name]
411 elif as_type == SequenceTypes.NucleicAcid:
412 return PDB_NUCLEOTIDES[residue_name]
413 else:
414 raise UnknownPDBResidueError(residue_name)
415 except KeyError:
416 raise UnknownPDBResidueError(residue_name)
417
440
441 - def parse(self, filename=None, model=None):
446
448 """
449 Parse and return the L{Structure} with the specified model identifier.
450 If no explicit model is specified, parse the first model in the
451 structure.
452
453 @param model: parse exactly the model with this ID
454 @type model: str
455
456 @return: object representation of the selected model
457 @rtype: L{Structure}
458
459 @raise ValueError: When an invalid model ID is specified
460 @raise PDBParseError: When the input PDB file suffers from unrecoverable
461 corruption. More specialized exceptions will be
462 raised depending on the context (see L{PDBParseError}'s
463 subclasses).
464 """
465 if model is not None:
466 model = int(model)
467
468 structure = self._parse_header(model)
469
470 self._parse_atoms(structure, model)
471 self._parse_ss(structure)
472
473 return structure
474
504
506 """
507 Parse and return the L{Structure} of the biological unit (quaternary
508 structure) as annotated by the REMARK 350 BIOMOLECULE record.
509
510 @param number: biomolecule number
511 @type number: int
512
513 @param single: if True, assign new single-letter chain
514 identifiers. If False, assign multi-letter chain identifiers whith a
515 number appended to the original identifier, like "A1", "A2", ...
516 @type single: bool
517
518 @return: structure of biological unit
519 @rtype: L{Structure}
520 """
521 remarks = self._parse_remarks()
522 if 350 not in remarks:
523 raise PDBParseError('There is no REMARK 350')
524
525 current = 1
526 biomt = {current: {}}
527 chains = tuple()
528
529 def split(line):
530 return [c.strip() for c in line.split(',') if c.strip() != '']
531
532 for line in remarks[350]:
533 if line.startswith('BIOMOLECULE:'):
534 current = int(line[12:])
535 biomt[current] = {}
536 elif line.startswith('APPLY THE FOLLOWING TO CHAINS:'):
537 chains = tuple(split(line[30:]))
538 elif line.startswith(' AND CHAINS:'):
539 chains += tuple(split(line[30:]))
540 elif line.startswith(' BIOMT'):
541 num = int(line[8:12])
542 vec = line[12:].split()
543 vec = list(map(float, vec))
544 biomt[current].setdefault(chains, dict()).setdefault(num, []).extend(vec)
545
546 if number not in biomt or len(biomt[number]) == 0:
547 raise KeyError('no BIOMOLECULE number {0}'.format(number))
548
549 asu = self.parse_structure()
550 structure = csb.bio.structure.Structure('{0}_{1}'.format(asu.accession, number))
551
552 newchainiditer = iter('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789')
553
554 for chains, matrices in biomt[number].items():
555 for num in matrices:
556 mat = numpy.array(matrices[num][0:12]).reshape((3,4))
557 R, t = mat[:3,:3], mat[:3,3]
558
559 for chain in chains:
560 if chain not in asu:
561 raise PDBParseError('chain {0} missing'.format(chain))
562 copy = asu[chain].clone()
563 copy.transform(R, t)
564 if single:
565 if len(structure.chains) == 62:
566 raise ValueError('too many chains for single=True')
567 copy.id = next(newchainiditer)
568 else:
569 copy.id = '{0}{1}'.format(chain, num)
570 structure.chains.append(copy)
571
572 return structure
573
574 @abstractmethod
576 """
577 An abstract method, which subclasses must use to customize the way the
578 PDB header (or the absence of a such) is handled. The implementation
579 must rely on reading character data from the current internal
580 self._stream and must return a new L{csb.bio.structure.Structure}
581 instance with properly initialized header data: accession, model,
582 molecule identifiers, chains and residues. This structure object is
583 then internally passed to the C{_parse_atoms} hook, responsible for
584 attachment of the atoms to the residues in the structure.
585 """
586 pass
587
603
605 """
606 Parse the ATOMs from the specified C{model} and attach them to the
607 C{structure}.
608 """
609
610 structure.model_id = None
611
612 atoms = dict( (chain, []) for chain in structure.chains )
613 chains = set()
614 total_chains = len([c for c in structure.items if c.length > 0])
615 het_residues = dict( (chain, set()) for chain in structure.chains )
616 in_ligands = False
617 in_atom = False
618
619 self._stream.seek(0)
620 while True:
621
622 try:
623 line = next(self._stream)
624 except StopIteration:
625 break
626
627 if line.startswith('HET '):
628 het_residue, het_chain, het_residue_id = line[7:10].strip(), line[12], line[13:18].strip()
629
630 if het_chain in structure:
631 chain = structure.chains[het_chain]
632 if chain.type == SequenceTypes.Protein and het_residue in PDB_AMINOACIDS:
633 het_residues[het_chain].add(het_residue_id)
634 elif chain.type == SequenceTypes.NucleicAcid and het_residue in PDB_NUCLEOTIDES:
635 het_residues[het_chain].add(het_residue_id)
636
637 elif line.startswith('MODEL'):
638 if model and model != int(line[10:14]):
639 self._scroll_model(model, self._stream)
640 structure.model_id = model
641 else:
642 model = int(line[10:14])
643 structure.model_id = model
644
645 elif line.startswith('ATOM') \
646 or (line.startswith('HETATM') and not in_ligands):
647 in_atom = True
648
649 rank = int(line[22:26])
650 serial_number = int(line[6:11])
651 name = line[12:16]
652 x, y, z = line[30:38], line[38:46], line[46:54]
653 vector = numpy.array([float(x), float(y), float(z)])
654 element = line[76:78].strip()
655 if element:
656 try:
657 element = csb.core.Enum.parsename(ChemElements, element)
658 except csb.core.EnumMemberError:
659 if element in ('D', 'X'):
660 element = ChemElements.x
661 else:
662 raise StructureFormatError('Unknown chemical element: {0}'.format(element))
663 else:
664 element = None
665
666 atom = csb.bio.structure.Atom(serial_number, name, element,
667 vector)
668
669 atom._het = line.startswith('HETATM')
670 atom._rank = rank
671 atom._sequence_number = int(line[22:26].strip())
672 atom._residue_id = str(atom._sequence_number)
673 atom._insertion_code = line[26].strip()
674 if not atom._insertion_code:
675 atom._insertion_code = None
676 else:
677 atom._residue_id += atom._insertion_code
678
679 atom.alternate = line[16].strip()
680 if not atom.alternate:
681 atom.alternate = None
682
683 atom._chain = line[21].strip()
684 if atom._chain not in structure.chains:
685 raise StructureFormatError(
686 'Atom {0}: chain {1} is undefined'.format(serial_number, atom._chain))
687 chains.add(atom._chain)
688 residue_name = line[17:20].strip()
689 residue_name = self.parse_residue_safe(residue_name, as_type=structure.chains[atom._chain].type)
690 if structure.chains[atom._chain].type == SequenceTypes.NucleicAcid:
691 atom._residue_name = csb.core.Enum.parsename(SequenceAlphabets.Nucleic, residue_name)
692 else:
693 atom._residue_name = csb.core.Enum.parsename(SequenceAlphabets.Protein, residue_name)
694
695 atom.occupancy = float(line[54:60].strip() or 0)
696 atom.bfactor = float(line[60:66].strip() or 0)
697
698 charge = line[78:80].strip()
699 if charge:
700 if charge in ('+', '-'):
701 charge += '1'
702 if charge[-1] in ('+', '-'):
703 charge = charge[::-1]
704 atom.charge = int(charge)
705
706 atoms[atom._chain].append(atom)
707
708 elif in_atom and line.startswith('TER'):
709 in_atom = False
710 if len(chains) == total_chains:
711 in_ligands = True
712
713 elif line.startswith('ENDMDL'):
714 break
715
716 elif line.startswith('END'):
717 break
718
719 if structure.model_id != model:
720 raise ValueError('No such model {0} in the structure.'.format(model))
721
722 self._map_residues(structure, atoms, het_residues)
723
725
726 assert set(structure.chains) == set(atoms.keys())
727
728 for chain in structure.chains:
729
730 subject = structure.chains[chain].sequence
731 query = ''
732 fragments = []
733
734 seq_numbers = []
735 lookup = {}
736
737 i = -1
738 for a in atoms[chain]:
739 if a._residue_id not in lookup:
740 i += 1
741 lookup[a._residue_id] = [a._sequence_number, a._insertion_code]
742 seq_numbers.append(a._residue_id)
743 res_name = a._residue_name.value
744 res_id = '{0}{1}'.format(a._sequence_number or '', a._insertion_code or '').strip()
745 if a._het and res_id not in het_residues[chain]:
746
747 fragments.append([res_name, '?'])
748 elif a._insertion_code and not (i > 0 and lookup[seq_numbers[i - 1]][1]):
749 fragments.append([res_name])
750 elif i == 0 or a._sequence_number - lookup[seq_numbers[i - 1]][0] not in (0, 1, -1):
751
752 fragments.append([res_name])
753 else:
754
755 if fragments[-1][-1] == '?':
756
757
758 fragments.append([res_name])
759 else:
760
761 fragments[-1].append(res_name)
762
763 for i, frag in enumerate(fragments):
764 fragments[i] = ''.join(frag)
765 if len(fragments) > 100:
766
767 raise StructureFormatError("Can't map structure with more than 100 fragments in ATOMS")
768 query = '^.*?({0}).*?$'.format(').*?('.join(fragments))
769
770 matches = re.match(query, subject)
771
772 if matches:
773 seq_numitem = -1
774 for frag_number, frag in enumerate(matches.groups(), start=1):
775 if frag is '':
776
777
778 seq_numitem += 1
779 else:
780 for i, dummy in enumerate(frag, start=1):
781 seq_numitem += 1
782
783 try:
784 lookup[ seq_numbers[seq_numitem] ] = matches.start(frag_number) + i
785 except:
786 raise
787
788 fixed_residue = None
789 for atom in atoms[chain]:
790 if not isinstance(lookup[atom._residue_id], int):
791 continue
792 atom._rank = lookup[atom._residue_id]
793 residue = structure.chains[chain].residues[atom._rank]
794 if residue is not fixed_residue:
795 residue.id = atom._sequence_number, atom._insertion_code
796 fixed_residue = residue
797
798 assert str(residue.type) == subject[atom._rank - 1]
799 residue.atoms.append(atom)
800
801 del atom._rank
802 del atom._insertion_code
803 del atom._sequence_number
804 del atom._chain
805 del atom._residue_id
806 del atom._residue_name
807 else:
808 if structure.chains[chain].length == 0 and len(atoms[chain]) > 0:
809 raise StructureFormatError("Can't add atoms: chain {0} has no residues".format(chain))
810 else:
811 raise StructureFormatError("Can't map atoms")
812
814 """
815 Parse and attach secondary structure data.
816
817 @bug: Currently the PDB helix types are ignored. Each HELIX line is treated
818 as a regular SecStructures.Helix. This is due to incompatibility
819 between DSSP and PDB helix types.
820 @todo: Implement a proper workaround for the previous bug (e.g. skip all
821 helices types not included in the DSSP enum)
822
823 @warning: In this implementation only the start/end positions of the SS
824 elements are parsed. Additional data like H-bonding is ignored.
825
826 @bug: Currently structure.to_pdb() is not writing any SS data.
827 """
828 elements = {}
829 self._stream.seek(0)
830
831 while True:
832 try:
833 line = next(self._stream)
834 except StopIteration:
835 break
836
837 if line.startswith('HELIX'):
838
839 chain = structure.chains[line[19].strip()]
840 if chain.id not in elements:
841 elements[chain.id] = []
842 if chain.id != line[31].strip():
843 if self._check_ss:
844 raise SecStructureFormatError('Helix {0} spans multiple chains'.format(line[7:10]))
845 else:
846 continue
847 try:
848 startres = chain.find(line[21:25].strip(), line[25].strip())
849 endres = chain.find(line[33:37].strip(), line[37].strip())
850 except csb.core.ItemNotFoundError as ex:
851 if self._check_ss:
852 raise SecStructureFormatError(
853 'Helix {0} refers to an undefined residue ID: {1}'.format(line[7:10], str(ex)))
854 else:
855 continue
856 if not startres.rank <= endres.rank:
857 if self._check_ss:
858 raise SecStructureFormatError('Helix {0} is out of range'.format(line[7:10]))
859 else:
860 continue
861 helix = csb.bio.structure.SecondaryStructureElement(startres.rank, endres.rank, SecStructures.Helix)
862 elements[chain.id].append(helix)
863
864 if line.startswith('SHEET'):
865
866 chain = structure.chains[line[21].strip()]
867 if chain.id not in elements:
868 elements[chain.id] = []
869 if chain.id != line[32].strip():
870 if self._check_ss:
871 raise SecStructureFormatError('Sheet {0} spans multiple chains'.format(line[7:10]))
872 else:
873 continue
874 try:
875 startres = chain.find(line[22:26].strip(), line[26].strip())
876 endres = chain.find(line[33:37].strip(), line[37].strip())
877 except csb.core.ItemNotFoundError as ex:
878 if self._check_ss:
879 raise SecStructureFormatError(
880 'Sheet {0} refers to an undefined residue ID: {1}'.format(line[7:10], str(ex)))
881 else:
882 continue
883 if not startres.rank <= endres.rank:
884 if self._check_ss:
885 raise SecStructureFormatError('Sheet {0} is out of range'.format(line[7:10]))
886 else:
887 continue
888 strand = csb.bio.structure.SecondaryStructureElement(startres.rank, endres.rank, SecStructures.Strand)
889 elements[chain.id].append(strand)
890
891 elif line.startswith('MODEL') or line.startswith('ATOM'):
892 break
893
894 for chain_id in elements:
895 ss = csb.bio.structure.SecondaryStructure()
896 for e in elements[chain_id]:
897 ss.append(e)
898 structure.chains[chain_id].secondary_structure = ss
899
901 """
902 Read REMARK lines from PDB file.
903
904 @return: dictionary with remark numbers as keys, and lists of lines as values.
905 @rtype: dict
906 """
907 self._stream.seek(0)
908
909 remarks = {}
910
911 for line in self._stream:
912 if line.startswith('REMARK'):
913 num = int(line[7:10])
914 lstring = line[11:]
915 remarks.setdefault(num, []).append(lstring)
916 elif line.startswith('DBREF') or line.startswith('ATOM'):
917 break
918
919 return remarks
920
923 """
924 This is the de facto PDB parser, which is designed to read SEQRES and ATOM
925 sections separately, and them map them. Intentionally fails to parse
926 malformed PDB files, e.g. a PDB file without a HEADER section.
927 """
928
930 """
931 Parse the HEADER section of a regular PDB file.
932
933 @return: a L{csb.bio.structure.Structure} instance with properly
934 initialized residues from the SEQRES.
935 @rtype: L{csb.bio.structure.Structure}
936
937 @raise PDBParseError: if the stream has no HEADER at byte 0
938 """
939
940 self._stream.seek(0)
941
942 header = next(self._stream)
943 if not header.startswith('HEADER'):
944 raise PDBParseError('Does not look like a regular PDB file.')
945
946 structure = csb.bio.structure.Structure(header.split()[-1])
947
948 while True:
949
950 try:
951 line = next(self._stream)
952 except StopIteration:
953 break
954
955 if line.startswith('COMPND'):
956 if line[10:].lstrip().startswith('MOL_ID:'):
957 mol_id = int(line[18:].replace(';', '').strip())
958 chain_name = ''
959 chains = ''
960 while line.startswith('COMPND'):
961 line = next(self._stream)
962 if line.split()[2].startswith('MOLECULE:'):
963 chain_name += line[20:].strip()
964 while not chain_name.endswith(';'):
965 line = next(self._stream)
966 if not line.startswith('COMPND'):
967 break
968 chain_name += ' ' + line[11:].strip()
969 else:
970 while not line.split()[2].startswith('CHAIN:'):
971 line = next(self._stream)
972 if not line.startswith('COMPND'):
973 raise HeaderFormatError('Missing chain identifier in COMPND section')
974 chains = line[17:].strip()
975 while not chains.endswith(';'):
976 line = next(self._stream)
977 if not line.startswith('COMPND'):
978 break
979 chains += ', ' + line[11:].strip()
980 break
981
982 chain_name = chain_name.strip()[:-1]
983 for chain in chains.replace(';', ' ').replace(',', ' ').split() or ['']:
984 new_chain = csb.bio.structure.Chain(chain, type=SequenceTypes.Unknown,
985 name=chain_name, accession=structure.accession)
986 new_chain.molecule_id = mol_id
987 try:
988 structure.chains.append(new_chain)
989 except csb.core.DuplicateKeyError:
990 raise HeaderFormatError('Chain {0} is already defined.'.format(new_chain.id))
991
992 elif line.startswith('REMARK 2 RESOLUTION'):
993 res = re.search("(\d+(?:\.\d+)?)\s+ANGSTROM", line)
994 if res and res.groups():
995 structure.resolution = float(res.group(1))
996
997 elif line.startswith('SEQRES'):
998
999 seq_fields = [line[7:10], line[11], line[13:17] ]
1000 seq_fields.extend(line[18:].split())
1001
1002 rank_base = int(seq_fields[0])
1003 chain_id = seq_fields[1].strip()
1004
1005 if chain_id not in structure.chains:
1006 raise HeaderFormatError('Chain {0} is undefined'.format(chain_id))
1007
1008 chain = structure.chains[chain_id]
1009
1010 if chain.type == SequenceTypes.Unknown:
1011 inner_residuerank = int(len(seq_fields[3:]) / 2) + 3
1012 for i in [inner_residuerank, 3, -1]:
1013 try:
1014 chain.type = self.guess_sequence_type(seq_fields[i])
1015 break
1016 except UnknownPDBResidueError:
1017 pass
1018
1019 for i, residue_name in enumerate(seq_fields[3:]):
1020 rank = rank_base * 13 - (13 - (i + 1))
1021 rname = self.parse_residue_safe(residue_name, as_type=chain.type)
1022 residue = csb.bio.structure.Residue.create(chain.type, rank=rank, type=rname)
1023 residue._pdb_name = residue_name
1024 structure.chains[chain_id].residues.append(residue)
1025 assert structure.chains[chain_id].residues.last_index == rank
1026
1027 elif line.startswith('MODEL') or line.startswith('ATOM'):
1028 break
1029
1030 return structure
1031
1034 """
1035 Ultra fast PDB HEADER parser. Does not read any structural data.
1036 """
1037
1040
1043
1046
1049 """
1050 This is a customized PDB parser, which is designed to read both sequence and
1051 atom data from the ATOM section. This is especially useful when parsing PDB
1052 files without a header.
1053 """
1054
1149
1150
1151 StructureParser = AbstractStructureParser.create_parser
1152 """
1153 Alias for L{AbstractStructureParser.create_parser}.
1154 """
1157 """
1158 Base abstract files for all structure file formatters.
1159 Defines a common step-wise interface according to the Builder pattern.
1160
1161 @param output: output stream (this is where the product is constructed)
1162 @type output: stream
1163 """
1164
1165 __metaclass__ = ABCMeta
1166
1168
1169 if not hasattr(output, 'write'):
1170 raise TypeError(output)
1171
1172 def isnull(this, that, null=None):
1173 if this is null:
1174 return that
1175 else:
1176 return this
1177
1178 self._out = output
1179 self._isnull = isnull
1180
1181 @property
1183 """
1184 Destination stream
1185 @rtype: stream
1186 """
1187 return self._out
1188
1189 @property
1191 """
1192 ISNULL(X, Y) function
1193 @rtype: callable
1194 """
1195 return self._isnull
1196
1198 """
1199 Write a chunk of text
1200 """
1201 self._out.write(text)
1202
1204 """
1205 Write a chunk of text and append a new line terminator
1206 """
1207 self._out.write(text)
1208 self._out.write('\n')
1209
1210 @abstractmethod
1213
1214 @abstractmethod
1217
1220
1222 """
1223 PDB file format builder.
1224 """
1225
1228
1230 """
1231 Write the HEADER of the file using C{master}
1232
1233 @type master: L{Structure}
1234 """
1235
1236 isnull = self.isnull
1237
1238 header = 'HEADER {0:40}{1:%d-%b-%y} {2:4}'
1239 self.writeline(header.format('.', datetime.datetime.now(), master.accession.upper()))
1240
1241 molecules = { }
1242
1243 for chain_id in master.chains:
1244 chain = master.chains[chain_id]
1245 if chain.molecule_id not in molecules:
1246 molecules[chain.molecule_id] = [ ]
1247 molecules[chain.molecule_id].append(chain_id)
1248
1249 k = 0
1250 for mol_id in sorted(molecules):
1251
1252 chains = molecules[mol_id]
1253 first_chain = master.chains[ chains[0] ]
1254
1255 self.writeline('COMPND {0:3} MOL_ID: {1};'.format(k + 1, isnull(mol_id, '0')))
1256 self.writeline('COMPND {0:3} MOLECULE: {1};'.format(k + 2, isnull(first_chain.name, '')))
1257 self.writeline('COMPND {0:3} CHAIN: {1};'.format(k + 3, ', '.join(chains)))
1258 k += 3
1259
1260 for chain_id in master.chains:
1261
1262 chain = master.chains[chain_id]
1263 res = [ r._pdb_name for r in chain.residues ]
1264
1265 rn = 0
1266 for j in range(0, chain.length, 13):
1267 rn += 1
1268 residues = [ '{0:>3}'.format(r) for r in res[j : j + 13] ]
1269 self.writeline('SEQRES {0:>3} {1} {2:>4} {3}'.format(
1270 rn, chain.id, chain.length, ' '.join(residues) ))
1271
1273 """
1274 Append a new model to the file
1275
1276 @type structure: L{Structure}
1277 """
1278
1279 isnull = self.isnull
1280
1281 for chain_id in structure.chains:
1282
1283 chain = structure.chains[chain_id]
1284 for residue in chain.residues:
1285
1286 atoms = [ ]
1287 for an in residue.atoms:
1288 atom = residue.atoms[an]
1289 if isinstance(atom, csb.bio.structure.DisorderedAtom):
1290 for dis_atom in atom: atoms.append(dis_atom)
1291 else:
1292 atoms.append(atom)
1293 atoms.sort()
1294
1295 for atom in atoms:
1296
1297 alt = atom.alternate
1298 if alt is True:
1299 alt = 'A'
1300 elif alt is False:
1301 alt = ' '
1302
1303 if atom.element:
1304 element = repr(atom.element)
1305 else:
1306 element = ' '
1307 self.writeline('ATOM {0:>5} {1:>4}{2}{3:>3} {4}{5:>4}{6} {7:>8.3f}{8:>8.3f}{9:>8.3f}{10:>6.2f}{11:>6.2f}{12:>12}{13:2}'.format(
1308 atom.serial_number, atom._full_name, isnull(alt, ' '),
1309 residue._pdb_name, chain.id,
1310 isnull(residue.sequence_number, residue.rank), isnull(residue.insertion_code, ' '),
1311 atom.vector[0], atom.vector[1], atom.vector[2], isnull(atom.occupancy, 0.0), isnull(atom.bfactor, 0.0),
1312 element, isnull(atom.charge, ' ') ))
1313
1314 self.writeline('TER')
1315
1317 """
1318 Add the END marker
1319 """
1320 self.writeline('END')
1321 self._out.flush()
1322
1324 """
1325 Supports serialization of NMR ensembles.
1326
1327 Functions as a simple decorator, which wraps C{add_structure} with
1328 MODEL/ENDMDL records.
1329 """
1330
1338
1341 """
1342 Base class for all PDB data source providers.
1343
1344 Concrete classes need to implement the C{find} method, which abstracts the
1345 retrieval of a PDB structure file by a structure identifier. This is a hook
1346 method called internally by C{get}, but subclasses can safely override both
1347 C{find} and {get} to in order to achieve completely custom behavior.
1348 """
1349
1350 __metaclass__ = ABCMeta
1351
1354
1355 @abstractmethod
1356 - def find(self, id):
1357 """
1358 Attempt to discover a PDB file, given a specific PDB C{id}.
1359
1360 @param id: structure identifier (e.g. 1x80)
1361 @type id: str
1362 @return: path and file name on success, None otherwise
1363 @rtype: str or None
1364 """
1365 pass
1366
1367 - def get(self, id, model=None):
1368 """
1369 Discover, parse and return the PDB structure, corresponding to the
1370 specified C{id}.
1371
1372 @param id: structure identifier (e.g. 1x80)
1373 @type id: str
1374 @param model: optional model identifier
1375 @type model: str
1376
1377 @rtype: L{csb.bio.Structure}
1378
1379 @raise StructureNotFoundError: when C{id} could not be found
1380 """
1381 pdb = self.find(id)
1382
1383 if pdb is None:
1384 raise StructureNotFoundError(id)
1385 else:
1386 return StructureParser(pdb).parse_structure(model=model)
1387
1389 """
1390 Simple file system based PDB data source. Scans a list of local directories
1391 using pre-defined file name templates.
1392
1393 @param paths: a list of paths
1394 @type paths: iterable or str
1395 """
1396
1408
1409 @property
1411 """
1412 Current search paths
1413 @rtype: tuple
1414 """
1415 return tuple(self._paths)
1416
1417 @property
1419 """
1420 Current file name match templates
1421 @rtype: tuple
1422 """
1423 return tuple(self._templates)
1424
1425 - def add(self, path):
1426 """
1427 Register a new local C{path}.
1428
1429 @param path: directory name
1430 @type path: str
1431
1432 @raise IOError: if C{path} is not a valid directory
1433 """
1434 if os.path.isdir(path):
1435 self._paths[path] = path
1436 else:
1437 raise IOError(path)
1438
1440 """
1441 Register a custom file name name C{template}. The template must contain
1442 an E{lb}idE{rb} macro, e.g. pdbE{lb}idE{rb}.ent
1443
1444 @param template: pattern
1445 @type template: str
1446 """
1447
1448 if '{id}' not in template:
1449 raise ValueError('Template does not contain an "{id}" macro')
1450
1451 if template not in self._templates:
1452 self._templates.append(template)
1453
1455 """
1456 Unregister an existing local C{path}.
1457
1458 @param path: directory name
1459 @type path: str
1460
1461 @raise ValueError: if C{path} had not been registered
1462 """
1463 if path not in self._paths:
1464 raise ValueError('path not found: {0}'.format(path))
1465
1466 del self._paths[path]
1467
1468 - def find(self, id):
1469
1470 for path in self._paths:
1471 for token in self.templates:
1472 fn = os.path.join(path, token.format(id=id))
1473 if os.path.exists(fn):
1474 return fn
1475
1476 return None
1477
1479 """
1480 Retrieves PDB structures from a specified remote URL.
1481 The URL requested from remote server takes the form: <prefix>/<ID><suffix>
1482
1483 @param prefix: URL prefix, including protocol
1484 @type prefix: str
1485 @param suffix: optional URL suffix (.ent by default)
1486 @type suffix: str
1487 """
1488
1489 - def __init__(self, prefix='http://www.rcsb.org/pdb/files/pdb', suffix='.ent'):
1496
1497 @property
1499 """
1500 Current URL prefix
1501 @rtype: str
1502 """
1503 return self._prefix
1504 @prefix.setter
1506 self._prefix = value
1507
1508 @property
1510 """
1511 Current URL suffix
1512 @rtype: str
1513 """
1514 return self._suffix
1515 @suffix.setter
1517 self._suffix = value
1518
1525
1526 - def find(self, id):
1540
1541 - def get(self, id, model=None):
1552
1554 """
1555 A custom PDB data source. Functions as a user-defined map of structure
1556 identifiers and their corresponding local file names.
1557
1558 @param files: initialization dictionary of id:file pairs
1559 @type files: dict-like
1560 """
1561
1563
1564 self._files = {}
1565 for id in files:
1566 self.add(id, files[id])
1567
1568 @property
1570 """
1571 List of currently registered file names
1572 @rtype: tuple
1573 """
1574 return tuple(self._files.values())
1575
1576 @property
1578 """
1579 List of currently registered structure identifiers
1580 @rtype: tuple
1581 """
1582 return tuple(self._files)
1583
1584 - def add(self, id, path):
1585 """
1586 Register a new local C{id}:C{path} pair.
1587
1588 @param id: structure identifier
1589 @type id: str
1590 @param path: path and file name
1591 @type path: str
1592
1593 @raise IOError: if C{path} is not a valid file name
1594 """
1595 if os.path.isfile(path):
1596 self._files[id] = path
1597 else:
1598 raise IOError(path)
1599
1601 """
1602 Unregister an existing structure C{id}.
1603
1604 @param id: structure identifier
1605 @type id: str
1606
1607 @raise ValueError: if C{id} had not been registered
1608 """
1609 if id not in self._files:
1610 raise ValueError(id)
1611 else:
1612 del self._files[id]
1613
1614 - def find(self, id):
1615
1616 if id in self._files:
1617 return self._files[id]
1618 else:
1619 return None
1620
1621 -def get(accession, model=None, prefix='http://www.rcsb.org/pdb/files/pdb'):
1622 """
1623 Download and parse a PDB entry.
1624
1625 @param accession: accession number of the entry
1626 @type accession: str
1627 @param model: model identifier
1628 @type model: str
1629 @param prefix: download URL prefix
1630 @type prefix: str
1631
1632 @return: object representation of the selected model
1633 @rtype: L{Structure}
1634 """
1635 return RemoteStructureProvider(prefix).get(accession, model=model)
1636
1637 -def find(id, paths):
1638 """
1639 Try to discover a PDB file for PDB C{id} in C{paths}.
1640
1641 @param id: PDB ID of the entry
1642 @type id: str
1643 @param paths: a list of directories to scan
1644 @type paths: list of str
1645
1646 @return: path and file name on success, None otherwise
1647 @rtype: str
1648 """
1649 return FileSystemStructureProvider(paths).find(id)
1650
1653
1654 - def __init__(self, result, exception):
1655
1656 self.result = result
1657 self.exception = exception
1658
1660 return '<AsyncParseResult: result={0.result}, error={0.exception.__class__.__name__}>'.format(self)
1661
1666
1669 """
1670 Wraps StructureParser in an asynchronous call. Since a new process is
1671 started by Python internally (as opposed to only starting a new thread),
1672 this makes the parser slower, but provides a way to set a parse timeout
1673 limit.
1674
1675 If initialized with more than one worker, supports parallel parsing
1676 through the C{self.parse_async} method.
1677
1678 @param workers: number of worker threads (1 by default)
1679 @type workers: int
1680 """
1681
1683
1684 self._pool = None
1685 self._workers = 1
1686
1687 if int(workers) > 0:
1688 self._workers = int(workers)
1689 else:
1690 raise ValueError(workers)
1691
1692 self._recycle()
1693
1695
1696 if self._pool:
1697 self._pool.terminate()
1698
1699 self._pool = multiprocessing.Pool(processes=self._workers)
1700
1703 """
1704 Call StructureParser.parse_structure() in a separate process and return
1705 the output. Raise TimeoutError if the parser does not respond within
1706 C{timeout} seconds.
1707
1708 @param structure_file: structure file to parse
1709 @type structure_file: str
1710 @param timeout: raise multiprocessing.TimeoutError if C{timeout} seconds
1711 elapse before the parser completes its job
1712 @type timeout: int
1713 @param parser: any implementing L{AbstractStructureParser} class
1714 @type parser: type
1715
1716 @return: parsed structure
1717 @rtype: L{csb.structure.Structure}
1718 """
1719
1720 r = self.parse_async([structure_file], timeout, model, parser)
1721 if len(r) > 0:
1722 if r[0].exception is not None:
1723 raise r[0].exception
1724 else:
1725 return r[0].result
1726 return None
1727
1730 """
1731 Call C{self.parse_structure} for a list of structure files
1732 simultaneously. The actual degree of parallelism will depend on the
1733 number of workers specified while constructing the parser object.
1734
1735 @note: Don't be tempted to pass a large list of structures to this
1736 method. Every time a C{TimeoutError} is encountered, the
1737 corresponding worker process in the pool will hang until the
1738 process terminates on its own. During that time, this worker is
1739 unusable. If a sufficiently high number of timeouts occur, the
1740 whole pool of workers will be unsable. At the end of the method
1741 however a pool cleanup is performed and any unusable workers
1742 are 'reactivated'. However, that only happens at B{the end} of
1743 C{parse_async}.
1744
1745 @param structure_files: a list of structure files
1746 @type structure_files: tuple of str
1747 @param timeout: raise multiprocessing.TimeoutError if C{timeout} seconds
1748 elapse before the parser completes its job
1749 @type timeout: int
1750 @param parser: any implementing L{AbstractStructureParser} class
1751 @type parser: type
1752
1753 @return: a list of L{AsyncParseResult} objects
1754 @rtype: list
1755 """
1756
1757 pool = self._pool
1758 workers = []
1759 results = []
1760
1761 for file in list(structure_files):
1762 result = pool.apply_async(_parse_async, [parser, file, model])
1763 workers.append(result)
1764
1765 hanging = False
1766 for w in workers:
1767 result = AsyncParseResult(None, None)
1768 try:
1769 result.result = w.get(timeout=timeout)
1770 except KeyboardInterrupt as ki:
1771 pool.terminate()
1772 raise ki
1773 except Exception as ex:
1774 result.exception = ex
1775 if isinstance(ex, multiprocessing.TimeoutError):
1776 hanging = True
1777 results.append(result)
1778
1779 if hanging:
1780 self._recycle()
1781
1782 return results
1783