1 """
2 PDB structure parsers, format builders and database providers.
3
4 The most basic usage is:
5
6 >>> parser = StructureParser('structure.pdb')
7 >>> parser.parse_structure()
8 <Structure> # a Structure object (model)
9
10 or if this is an NMR ensemble:
11
12 >>> parser.parse_models()
13 <Ensemble> # an Ensemble object (collection of alternative Structure-s)
14
15 This module introduces a family of PDB file parsers. The common interface of all
16 parsers is defined in L{AbstractStructureParser}. This class has several
17 implementations:
18
19 - L{RegularStructureParser} - handles normal PDB files with SEQRES fields
20 - L{LegacyStructureParser} - reads structures from legacy or malformed PDB files,
21 which are lacking SEQRES records (initializes all residues from the ATOMs instead)
22 - L{PDBHeaderParser} - reads only the headers of the PDB files and produces structures
23 without coordinates. Useful for reading metadata (e.g. accession numbers or just
24 plain SEQRES sequences) with minimum overhead
25
26 Unless you have a special reason, you should use the L{StructureParser} factory,
27 which returns a proper L{AbstractStructureParser} implementation, depending on the
28 input PDB file. If the input file looks like a regular PDB file, the factory
29 returns a L{RegularStructureParser}, otherwise it instantiates L{RegularStructureParser}.
30 L{StructureParser} is in fact an alias for L{AbstractStructureParser.create_parser}.
31
32 Another important abstraction in this module is L{StructureProvider}. It has several
33 implementations which can be used to retrieve PDB L{Structure}s from various sources:
34 file system directories, remote URLs, etc. You can easily create your own provider as
35 well. See L{StructureProvider} for details.
36
37 Finally, this module gives you some L{FileBuilder}s, used for text serialization
38 of L{Structure}s and L{Ensemble}s:
39
40 >>> builder = PDBFileBuilder(stream)
41 >>> builder.add_header(structure)
42 >>> builder.add_structure(structure)
43
44 where stream is any Python stream, e.g. an open file or sys.stdout.
45
46 See L{Ensemble} and L{Structure} from L{csb.bio.structure} for details on these
47 objects.
48 """
49
50 import re
51 import os
52 import numpy
53 import datetime
54 import multiprocessing
55
56 import csb.bio.structure
57 import csb.core
58 import csb.io
59
60 from abc import ABCMeta, abstractmethod
61 from csb.bio.sequence import SequenceTypes, SequenceAlphabets
62 from csb.bio.structure import ChemElements, SecStructures
63
64
65 PDB_AMINOACIDS = {
66 'PAQ': 'TYR', 'AGM': 'ARG', 'ILE': 'ILE', 'PR3': 'CYS', 'GLN': 'GLN',
67 'DVA': 'VAL', 'CCS': 'CYS', 'ACL': 'ARG', 'GLX': 'GLX', 'GLY': 'GLY',
68 'GLZ': 'GLY', 'DTH': 'THR', 'OAS': 'SER', 'C6C': 'CYS', 'NEM': 'HIS',
69 'DLY': 'LYS', 'MIS': 'SER', 'SMC': 'CYS', 'GLU': 'GLU', 'NEP': 'HIS',
70 'BCS': 'CYS', 'ASQ': 'ASP', 'ASP': 'ASP', 'SCY': 'CYS', 'SER': 'SER',
71 'LYS': 'LYS', 'SAC': 'SER', 'PRO': 'PRO', 'ASX': 'ASX', 'DGN': 'GLN',
72 'DGL': 'GLU', 'MHS': 'HIS', 'ASB': 'ASP', 'ASA': 'ASP', 'NLE': 'LEU',
73 'DCY': 'CYS', 'ASK': 'ASP', 'GGL': 'GLU', 'STY': 'TYR', 'SEL': 'SER',
74 'CGU': 'GLU', 'ASN': 'ASN', 'ASL': 'ASP', 'LTR': 'TRP', 'DAR': 'ARG',
75 'VAL': 'VAL', 'CHG': 'ALA', 'TPO': 'THR', 'CLE': 'LEU', 'GMA': 'GLU',
76 'HAC': 'ALA', 'AYA': 'ALA', 'THR': 'THR', 'TIH': 'ALA', 'SVA': 'SER',
77 'MVA': 'VAL', 'SAR': 'GLY', 'LYZ': 'LYS', 'BNN': 'ALA', '5HP': 'GLU',
78 'IIL': 'ILE', 'SHR': 'LYS', 'HAR': 'ARG', 'FME': 'MET', 'ALO': 'THR',
79 'PHI': 'PHE', 'ALM': 'ALA', 'PHL': 'PHE', 'MEN': 'ASN', 'TPQ': 'ALA',
80 'GSC': 'GLY', 'PHE': 'PHE', 'ALA': 'ALA', 'MAA': 'ALA', 'MET': 'MET',
81 'UNK': 'UNK', 'LEU': 'LEU', 'ALY': 'LYS', 'SET': 'SER', 'GL3': 'GLY',
82 'TRG': 'LYS', 'CXM': 'MET', 'TYR': 'TYR', 'SCS': 'CYS', 'DIL': 'ILE',
83 'TYQ': 'TYR', '3AH': 'HIS', 'DPR': 'PRO', 'PRR': 'ALA', 'CME': 'CYS',
84 'IYR': 'TYR', 'CY1': 'CYS', 'TYY': 'TYR', 'HYP': 'PRO', 'DTY': 'TYR',
85 '2AS': 'ASP', 'DTR': 'TRP', 'FLA': 'ALA', 'DPN': 'PHE', 'DIV': 'VAL',
86 'PCA': 'GLU', 'MSE': 'MET', 'MSA': 'GLY', 'AIB': 'ALA', 'CYS': 'CYS',
87 'NLP': 'LEU', 'CYQ': 'CYS', 'HIS': 'HIS', 'DLE': 'LEU', 'CEA': 'CYS',
88 'DAL': 'ALA', 'LLP': 'LYS', 'DAH': 'PHE', 'HMR': 'ARG', 'TRO': 'TRP',
89 'HIC': 'HIS', 'CYG': 'CYS', 'BMT': 'THR', 'DAS': 'ASP', 'TYB': 'TYR',
90 'BUC': 'CYS', 'PEC': 'CYS', 'BUG': 'LEU', 'CYM': 'CYS', 'NLN': 'LEU',
91 'CY3': 'CYS', 'HIP': 'HIS', 'CSO': 'CYS', 'TPL': 'TRP', 'LYM': 'LYS',
92 'DHI': 'HIS', 'MLE': 'LEU', 'CSD': 'ALA', 'HPQ': 'PHE', 'MPQ': 'GLY',
93 'LLY': 'LYS', 'DHA': 'ALA', 'DSN': 'SER', 'SOC': 'CYS', 'CSX': 'CYS',
94 'OMT': 'MET', 'DSP': 'ASP', 'PTR': 'TYR', 'TRP': 'TRP', 'CSW': 'CYS',
95 'EFC': 'CYS', 'CSP': 'CYS', 'CSS': 'CYS', 'SCH': 'CYS', 'OCS': 'CYS',
96 'NMC': 'GLY', 'SEP': 'SER', 'BHD': 'ASP', 'KCX': 'LYS', 'SHC': 'CYS',
97 'C5C': 'CYS', 'HTR': 'TRP', 'ARG': 'ARG', 'TYS': 'TYR', 'ARM': 'ARG',
98 'DNP': 'ALA'
99 }
100 """
101 Dictionary of non-standard amino acids, which could be found in PDB.
102 """
103
104
105 PDB_NUCLEOTIDES = {
106 'DA': 'Adenine', 'DG': 'Guanine', 'DC': 'Cytosine', 'DT': 'Thymine',
107 'A': 'Adenine', 'G': 'Guanine', 'C': 'Cytosine', 'T': 'Thymine',
108 'U': 'Uracil', 'DOC': 'Cytosine', 'R': 'Purine', 'Y': 'Pyrimidine',
109 'K': 'Ketone', ' M': 'Amino', 'S': 'Strong', 'W': 'Weak',
110 'B': 'NotA', 'D' : 'NotC', 'H': 'NotG', 'V': 'NotT',
111 'N': 'Any', 'X' : 'Masked'
112 }
113 """
114 Dictionary of non-standard nucleotides, which could be found in PDB.
115 """
119
122
125
128
131
134
135 -class InvalidEntryIDError(StructureFormatError):
137
138
139 -class EntryID(object):
140 """
141 Represents a PDB Chain identifier. Implementing classes must define
142 how the original ID is split into accession number and chain ID.
143
144 @param id: identifier
145 @type id: str
146 """
147 __metaclass__ = ABCMeta
148
149 - def __init__(self, id):
150
151 self._accession = ''
152 self._chain = ''
153
154 id = id.strip()
155 self._accession, self._chain = self.parse(id)
156
157 @staticmethod
159 """
160 Guess the format of C{id} and parse it.
161
162 @return: a new PDB ID of the appropriate type
163 @rtype: L{EntryID}
164 """
165
166 if len(id) in (4, 5):
167 return StandardID(id)
168 elif len(id) == 6 and id[4] == '_':
169 return SeqResID(id)
170 else:
171 return DegenerateID(id)
172
173 @abstractmethod
174 - def parse(self, id):
175 """
176 Split C{id} into accession number and chain ID.
177
178 @param id: PDB identifier
179 @type id: str
180 @return: (accession, chain)
181 @rtype: tuple of str
182
183 @raise InvalidEntryIDError: when C{id} is in an inappropriate format
184 """
185 pass
186
188 """
189 @return: the identifier in its original format
190 @rtype: str
191 """
192 return self.entry_id
193
194 @property
195 - def accession(self):
196 """
197 Accession number part of the Entry ID
198 @rtype: str
199 """
200 return self._accession
201
202 @property
204 """
205 Chain ID part of the Entry ID
206 @rtype: str
207 """
208 return self._chain
209
210 @property
211 - def entry_id(self):
212 """
213 Accession number + Chain ID
214 @rtype: str
215 """
216 return "{0.accession}{0.chain}".format(self)
217
220
222 """
223 Standard PDB ID in the following form: xxxxY, where xxxx is the accession
224 number (lower case) and Y is an optional chain identifier.
225 """
232
234 """
235 Looks like a L{StandardID}, except that the accession number may have
236 arbitrary length.
237 """
244
246 """
247 Same as a L{StandardID}, but contains an additional underscore between
248 te accession number and the chain identifier.
249 """
256
259
262 """
263 A base PDB structure format-aware parser. Subclasses must implement the
264 internal abstract method C{_parse_header} in order to complete the
265 implementation.
266
267 @param structure_file: the input PD file to parse
268 @type structure_file: str
269 @param check_ss: if True, secondary structure errors in the file will cause
270 L{SecStructureFormatError} exceptions
271 @type check_ss: bool
272
273 @raise IOError: when the input file cannot be found
274 """
275
276 __metaclass__ = ABCMeta
277
278 @staticmethod
280 """
281 A StructureParser factory, which instantiates and returns the proper parser
282 object based on the contents of the PDB file.
283
284 If the file contains a SEQRES section, L{RegularStructureParser} is returned,
285 otherwise L{LegacyStructureParser} is instantiated. In the latter case
286 LegacyStructureParser will read the sequence data directly from the ATOMs.
287
288 @param structure_file: the PDB file to parse
289 @type structure_file: str
290
291 @rtype: L{AbstractStructureParser}
292 """
293 has_seqres = False
294
295 for line in open(structure_file):
296 if line.startswith('SEQRES'):
297 has_seqres = True
298 break
299
300 if has_seqres:
301 return RegularStructureParser(structure_file)
302 else:
303 return LegacyStructureParser(structure_file)
304
305 - def __init__(self, structure_file, check_ss=False):
306
307 self._file = None
308 self._stream = None
309 self._check_ss = bool(check_ss)
310
311 self.filename = structure_file
312
314 try:
315 self._stream.close()
316 except:
317 pass
318
319 @property
321 """
322 Current input PDB file name
323 @rtype: str
324 """
325 return self._file
326 @filename.setter
328 try:
329 stream = open(name)
330 except IOError:
331 raise IOError('File not found: {0}'.format(name))
332
333 if self._stream:
334 try:
335 self._stream.close()
336 except:
337 pass
338 self._stream = stream
339 self._file = name
340
342 """
343 Find all available model identifiers in the structure.
344
345 @return: a list of model IDs
346 @rtype: list
347 """
348 models = []
349 check = set()
350
351 with open(self._file, 'r') as f:
352 for line in f:
353 if line.startswith('MODEL'):
354 model_id = int(line[10:14])
355 if model_id in check:
356 raise StructureFormatError('Duplicate model identifier: {0}'.format(model_id))
357 models.append(model_id)
358 check.add(model_id)
359
360 if len(models) > 0:
361 if not(min(check) == 1 and max(check) == len(models)):
362 raise StructureFormatError('Non-consecutive model identifier(s) encountered')
363 return models
364 else:
365 return [None]
366
368 """
369 Try to guess what is the sequence type of a PDB C{residue_name}.
370
371 @param residue_name: a PDB-conforming name of a residue
372 @type residue_name: str
373
374 @return: a L{SequenceTypes} enum member
375 @rtype: L{csb.core.EnumItem}
376
377 @raise UnknownPDBResidueError: when there is no such PDB residue name
378 in the catalog tables
379 """
380 if residue_name in PDB_AMINOACIDS:
381 return SequenceTypes.Protein
382 elif residue_name in PDB_NUCLEOTIDES:
383 return SequenceTypes.NucleicAcid
384 else:
385 raise UnknownPDBResidueError(residue_name)
386
388 """
389 Try to parse a PDB C{residue_name} and return its closest 'normal'
390 string representation. If a sequence type (C{as_type}) is defined,
391 guess the alphabet based on that information, otherwise try first to
392 parse it as a protein residue.
393
394 @param residue_name: a PDB-conforming name of a residue
395 @type residue_name: str
396 @param as_type: suggest a sequence type (L{SequenceTypes} member)
397 @type L{scb.core.EnumItem}
398
399 @return: a normalized residue name
400 @rtype: str
401
402 @raise UnknownPDBResidueError: when there is no such PDB residue name
403 in the catalog table(s)
404 """
405 if as_type is None:
406 as_type = self.guess_sequence_type(residue_name)
407
408 try:
409 if as_type == SequenceTypes.Protein:
410 return PDB_AMINOACIDS[residue_name]
411 elif as_type == SequenceTypes.NucleicAcid:
412 return PDB_NUCLEOTIDES[residue_name]
413 else:
414 raise UnknownPDBResidueError(residue_name)
415 except KeyError:
416 raise UnknownPDBResidueError(residue_name)
417
440
441 - def parse(self, filename=None, model=None):
446
448 """
449 Parse and return the L{Structure} with the specified model identifier.
450 If no explicit model is specified, parse the first model in the
451 structure.
452
453 @param model: parse exactly the model with this ID
454 @type model: str
455
456 @return: object representation of the selected model
457 @rtype: L{Structure}
458
459 @raise ValueError: When an invalid model ID is specified
460 @raise PDBParseError: When the input PDB file suffers from unrecoverable
461 corruption. More specialized exceptions will be
462 raised depending on the context (see L{PDBParseError}'s
463 subclasses).
464 """
465 if model is not None:
466 model = int(model)
467
468 structure = self._parse_header(model)
469
470 self._parse_atoms(structure, model)
471 self._parse_ss(structure)
472
473 return structure
474
476 """
477 Parse the specified models in the file and build an L{Ensemble}.
478
479 @param models: an iterable object providing model identifiers.
480 If not specified, all models will be parsed.
481 @type models: tuple
482
483 @return: an ensemble with all parsed models
484 @rtype: L{Ensemble}
485 """
486
487 if not models:
488 models = self.models()
489 else:
490 models = map(int, models)
491
492 ensemble = csb.bio.structure.Ensemble()
493 for model_id in models:
494 model = self.parse_structure(model_id)
495 ensemble.models.append(model)
496
497 return ensemble
498
500 """
501 Parse and return the L{Structure} of the biological unit (quaternary
502 structure) as annotated by the REMARK 350 BIOMOLECULE record.
503
504 @param number: biomolecule number
505 @type number: int
506
507 @param single: if True, assign new single-letter chain
508 identifiers. If False, assign multi-letter chain identifiers whith a
509 number appended to the original identifier, like "A1", "A2", ...
510 @type single: bool
511
512 @return: structure of biological unit
513 @rtype: L{Structure}
514 """
515 remarks = self._parse_remarks()
516 if 350 not in remarks:
517 raise PDBParseError('There is no REMARK 350')
518
519 current = 1
520 biomt = {current: {}}
521 chains = tuple()
522
523 def split(line):
524 return [c.strip() for c in line.split(',') if c.strip() != '']
525
526 for line in remarks[350]:
527 if line.startswith('BIOMOLECULE:'):
528 current = int(line[12:])
529 biomt[current] = {}
530 elif line.startswith('APPLY THE FOLLOWING TO CHAINS:'):
531 chains = tuple(split(line[30:]))
532 elif line.startswith(' AND CHAINS:'):
533 chains += tuple(split(line[30:]))
534 elif line.startswith(' BIOMT'):
535 num = int(line[8:12])
536 vec = line[12:].split()
537 vec = list(map(float, vec))
538 biomt[current].setdefault(chains, dict()).setdefault(num, []).extend(vec)
539
540 if number not in biomt or len(biomt[number]) == 0:
541 raise KeyError('no BIOMOLECULE number {0}'.format(number))
542
543 asu = self.parse_structure()
544 structure = csb.bio.structure.Structure('{0}_{1}'.format(asu.accession, number))
545
546 newchainiditer = iter('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789')
547
548 for chains, matrices in biomt[number].items():
549 for num in matrices:
550 mat = numpy.array(matrices[num][0:12]).reshape((3,4))
551 R, t = mat[:3,:3], mat[:3,3]
552
553 for chain in chains:
554 if chain not in asu:
555 raise PDBParseError('chain {0} missing'.format(chain))
556 copy = asu[chain].clone()
557 copy.transform(R, t)
558 if single:
559 if len(structure.chains) == 62:
560 raise ValueError('too many chains for single=True')
561 copy.id = next(newchainiditer)
562 else:
563 copy.id = '{0}{1}'.format(chain, num)
564 structure.chains.append(copy)
565
566 return structure
567
568 @abstractmethod
570 """
571 An abstract method, which subclasses must use to customize the way the
572 PDB header (or the absence of a such) is handled. The implementation
573 must rely on reading character data from the current internal
574 self._stream and must return a new L{csb.bio.structure.Structure}
575 instance with properly initialized header data: accession, model,
576 molecule identifiers, chains and residues. This structure object is
577 then internally passed to the C{_parse_atoms} hook, responsible for
578 attachment of the atoms to the residues in the structure.
579 """
580 pass
581
597
599 """
600 Parse the ATOMs from the specified C{model} and attach them to the
601 C{structure}.
602 """
603
604 structure.model_id = None
605
606 atoms = dict( (chain, []) for chain in structure.chains )
607 chains = set()
608 total_chains = len([c for c in structure.items if c.length > 0])
609 het_residues = dict( (chain, set()) for chain in structure.chains )
610 in_ligands = False
611 in_atom = False
612
613 self._stream.seek(0)
614 while True:
615
616 try:
617 line = next(self._stream)
618 except StopIteration:
619 break
620
621 if line.startswith('HET '):
622 het_residue, het_chain, het_residue_id = line[7:10].strip(), line[12], line[13:18].strip()
623
624 if het_chain in structure:
625 chain = structure.chains[het_chain]
626 if chain.type == SequenceTypes.Protein and het_residue in PDB_AMINOACIDS:
627 het_residues[het_chain].add(het_residue_id)
628 elif chain.type == SequenceTypes.NucleicAcid and het_residue in PDB_NUCLEOTIDES:
629 het_residues[het_chain].add(het_residue_id)
630
631 elif line.startswith('MODEL'):
632 if model and model != int(line[10:14]):
633 self._scroll_model(model, self._stream)
634 structure.model_id = model
635 else:
636 model = int(line[10:14])
637 structure.model_id = model
638
639 elif line.startswith('ATOM') \
640 or (line.startswith('HETATM') and not in_ligands):
641 in_atom = True
642
643 rank = int(line[22:26])
644 serial_number = int(line[6:11])
645 name = line[12:16]
646 x, y, z = line[30:38], line[38:46], line[46:54]
647 vector = numpy.array([float(x), float(y), float(z)])
648 element = line[76:78].strip()
649 if element:
650 try:
651 element = csb.core.Enum.parsename(ChemElements, element)
652 except csb.core.EnumMemberError:
653 if element in ('D', 'X'):
654 element = ChemElements.x
655 else:
656 raise StructureFormatError('Unknown chemical element: {0}'.format(element))
657 else:
658 element = None
659
660 atom = csb.bio.structure.Atom(serial_number, name, element,
661 vector)
662
663 atom._het = line.startswith('HETATM')
664 atom._rank = rank
665 atom._sequence_number = int(line[22:26].strip())
666 atom._residue_id = str(atom._sequence_number)
667 atom._insertion_code = line[26].strip()
668 if not atom._insertion_code:
669 atom._insertion_code = None
670 else:
671 atom._residue_id += atom._insertion_code
672
673 atom.alternate = line[16].strip()
674 if not atom.alternate:
675 atom.alternate = None
676
677 atom._chain = line[21].strip()
678 if atom._chain not in structure.chains:
679 raise StructureFormatError(
680 'Atom {0}: chain {1} is undefined'.format(serial_number, atom._chain))
681 chains.add(atom._chain)
682 residue_name = line[17:20].strip()
683 residue_name = self.parse_residue_safe(residue_name, as_type=structure.chains[atom._chain].type)
684 if structure.chains[atom._chain].type == SequenceTypes.NucleicAcid:
685 atom._residue_name = csb.core.Enum.parsename(SequenceAlphabets.Nucleic, residue_name)
686 else:
687 atom._residue_name = csb.core.Enum.parsename(SequenceAlphabets.Protein, residue_name)
688
689 atom.occupancy = float(line[54:60])
690 atom.temperature = float(line[60:66])
691
692 charge = line[78:80].strip()
693 if charge:
694 if charge in ('+', '-'):
695 charge += '1'
696 if charge[-1] in ('+', '-'):
697 charge = charge[::-1]
698 atom.charge = int(charge)
699
700 atoms[atom._chain].append(atom)
701
702 elif in_atom and line.startswith('TER'):
703 in_atom = False
704 if len(chains) == total_chains:
705 in_ligands = True
706
707 elif line.startswith('ENDMDL'):
708 break
709
710 elif line.startswith('END'):
711 break
712
713 if structure.model_id != model:
714 raise ValueError('No such model {0} in the structure.'.format(model))
715
716 self._map_residues(structure, atoms, het_residues)
717
719
720 assert set(structure.chains) == set(atoms.keys())
721
722 for chain in structure.chains:
723
724 subject = structure.chains[chain].sequence
725 query = ''
726 fragments = []
727
728 seq_numbers = []
729 lookup = {}
730
731 i = -1
732 for a in atoms[chain]:
733 if a._residue_id not in lookup:
734 i += 1
735 lookup[a._residue_id] = [a._sequence_number, a._insertion_code]
736 seq_numbers.append(a._residue_id)
737 res_name = a._residue_name.value
738 res_id = '{0}{1}'.format(a._sequence_number or '', a._insertion_code or '').strip()
739 if a._het and res_id not in het_residues[chain]:
740
741 fragments.append([res_name, '?'])
742 elif a._insertion_code and not (i > 0 and lookup[seq_numbers[i - 1]][1]):
743 fragments.append([res_name])
744 elif i == 0 or a._sequence_number - lookup[seq_numbers[i - 1]][0] not in (0, 1, -1):
745
746 fragments.append([res_name])
747 else:
748
749 if fragments[-1][-1] == '?':
750
751
752 fragments.append([res_name])
753 else:
754
755 fragments[-1].append(res_name)
756
757 for i, frag in enumerate(fragments):
758 fragments[i] = ''.join(frag)
759 if len(fragments) > 100:
760
761 raise StructureFormatError("Can't map structure with more than 100 fragments in ATOMS")
762 query = '^.*?({0}).*?$'.format(').*?('.join(fragments))
763
764 matches = re.match(query, subject)
765
766 if matches:
767 seq_numitem = -1
768 for frag_number, frag in enumerate(matches.groups(), start=1):
769 if frag is '':
770
771
772 seq_numitem += 1
773 else:
774 for i, dummy in enumerate(frag, start=1):
775 seq_numitem += 1
776
777 try:
778 lookup[ seq_numbers[seq_numitem] ] = matches.start(frag_number) + i
779 except:
780 raise
781
782 fixed_residue = None
783 for atom in atoms[chain]:
784 if not isinstance(lookup[atom._residue_id], int):
785 continue
786 atom._rank = lookup[atom._residue_id]
787 residue = structure.chains[chain].residues[atom._rank]
788 if residue is not fixed_residue:
789 residue.id = atom._sequence_number, atom._insertion_code
790 fixed_residue = residue
791
792 assert str(residue.type) == subject[atom._rank - 1]
793 residue.atoms.append(atom)
794
795 del atom._rank
796 del atom._insertion_code
797 del atom._sequence_number
798 del atom._chain
799 del atom._residue_id
800 del atom._residue_name
801 else:
802 if structure.chains[chain].length == 0 and len(atoms[chain]) > 0:
803 raise StructureFormatError("Can't add atoms: chain {0} has no residues".format(chain))
804 else:
805 raise StructureFormatError("Can't map atoms")
806
808 """
809 Parse and attach secondary structure data.
810
811 @bug: Currently the PDB helix types are ignored. Each HELIX line is treated
812 as a regular SecStructures.Helix. This is due to incompatibility
813 between DSSP and PDB helix types.
814 @todo: Implement a proper workaround for the previous bug (e.g. skip all
815 helices types not included in the DSSP enum)
816
817 @warning: In this implementation only the start/end positions of the SS
818 elements are parsed. Additional data like H-bonding is ignored.
819
820 @bug: Currently structure.to_pdb() is not writing any SS data.
821 """
822 elements = {}
823 self._stream.seek(0)
824
825 while True:
826 try:
827 line = next(self._stream)
828 except StopIteration:
829 break
830
831 if line.startswith('HELIX'):
832
833 chain = structure.chains[line[19].strip()]
834 if chain.id not in elements:
835 elements[chain.id] = []
836 if chain.id != line[31].strip():
837 if self._check_ss:
838 raise SecStructureFormatError('Helix {0} spans multiple chains'.format(line[7:10]))
839 else:
840 continue
841 try:
842 startres = chain.find(line[21:25].strip(), line[25].strip())
843 endres = chain.find(line[33:37].strip(), line[37].strip())
844 except csb.core.ItemNotFoundError as ex:
845 if self._check_ss:
846 raise SecStructureFormatError(
847 'Helix {0} refers to an undefined residue ID: {1}'.format(line[7:10], str(ex)))
848 else:
849 continue
850 if not startres.rank <= endres.rank:
851 if self._check_ss:
852 raise SecStructureFormatError('Helix {0} is out of range'.format(line[7:10]))
853 else:
854 continue
855 helix = csb.bio.structure.SecondaryStructureElement(startres.rank, endres.rank, SecStructures.Helix)
856 elements[chain.id].append(helix)
857
858 if line.startswith('SHEET'):
859
860 chain = structure.chains[line[21].strip()]
861 if chain.id not in elements:
862 elements[chain.id] = []
863 if chain.id != line[32].strip():
864 if self._check_ss:
865 raise SecStructureFormatError('Sheet {0} spans multiple chains'.format(line[7:10]))
866 else:
867 continue
868 try:
869 startres = chain.find(line[22:26].strip(), line[26].strip())
870 endres = chain.find(line[33:37].strip(), line[37].strip())
871 except csb.core.ItemNotFoundError as ex:
872 if self._check_ss:
873 raise SecStructureFormatError(
874 'Sheet {0} refers to an undefined residue ID: {1}'.format(line[7:10], str(ex)))
875 else:
876 continue
877 if not startres.rank <= endres.rank:
878 if self._check_ss:
879 raise SecStructureFormatError('Sheet {0} is out of range'.format(line[7:10]))
880 else:
881 continue
882 strand = csb.bio.structure.SecondaryStructureElement(startres.rank, endres.rank, SecStructures.Strand)
883 elements[chain.id].append(strand)
884
885 elif line.startswith('MODEL') or line.startswith('ATOM'):
886 break
887
888 for chain_id in elements:
889 ss = csb.bio.structure.SecondaryStructure()
890 for e in elements[chain_id]:
891 ss.append(e)
892 structure.chains[chain_id].secondary_structure = ss
893
895 """
896 Read REMARK lines from PDB file.
897
898 @return: dictionary with remark numbers as keys, and lists of lines as values.
899 @rtype: dict
900 """
901 self._stream.seek(0)
902
903 remarks = {}
904
905 for line in self._stream:
906 if line.startswith('REMARK'):
907 num = int(line[7:10])
908 lstring = line[11:]
909 remarks.setdefault(num, []).append(lstring)
910 elif line.startswith('DBREF') or line.startswith('ATOM'):
911 break
912
913 return remarks
914
917 """
918 This is the de facto PDB parser, which is designed to read SEQRES and ATOM
919 sections separately, and them map them. Intentionally fails to parse
920 malformed PDB files, e.g. a PDB file without a HEADER section.
921 """
922
924 """
925 Parse the HEADER section of a regular PDB file.
926
927 @return: a L{csb.bio.structure.Structure} instance with properly
928 initialized residues from the SEQRES.
929 @rtype: L{csb.bio.structure.Structure}
930
931 @raise PDBParseError: if the stream has no HEADER at byte 0
932 """
933
934 self._stream.seek(0)
935
936 header = next(self._stream)
937 if not header.startswith('HEADER'):
938 raise PDBParseError('Does not look like a regular PDB file.')
939
940 structure = csb.bio.structure.Structure(header.split()[-1])
941
942 while True:
943
944 try:
945 line = next(self._stream)
946 except StopIteration:
947 break
948
949 if line.startswith('COMPND'):
950 if line[10:].lstrip().startswith('MOL_ID:'):
951 mol_id = int(line[18:].replace(';', '').strip())
952 chain_name = ''
953 chains = ''
954 while line.startswith('COMPND'):
955 line = next(self._stream)
956 if line.split()[2].startswith('MOLECULE:'):
957 chain_name += line[20:].strip()
958 while not chain_name.endswith(';'):
959 line = next(self._stream)
960 if not line.startswith('COMPND'):
961 break
962 chain_name += ' ' + line[11:].strip()
963 else:
964 while not line.split()[2].startswith('CHAIN:'):
965 line = next(self._stream)
966 if not line.startswith('COMPND'):
967 raise HeaderFormatError('Missing chain identifier in COMPND section')
968 chains = line[17:].strip()
969 while not chains.endswith(';'):
970 line = next(self._stream)
971 if not line.startswith('COMPND'):
972 break
973 chains += ', ' + line[11:].strip()
974 break
975
976 chain_name = chain_name.strip()[:-1]
977 for chain in chains.replace(';', ' ').replace(',', ' ').split() or ['']:
978 new_chain = csb.bio.structure.Chain(chain, type=SequenceTypes.Unknown,
979 name=chain_name, accession=structure.accession)
980 new_chain.molecule_id = mol_id
981 try:
982 structure.chains.append(new_chain)
983 except csb.core.DuplicateKeyError:
984 raise HeaderFormatError('Chain {0} is already defined.'.format(new_chain.id))
985
986 elif line.startswith('SEQRES'):
987
988 seq_fields = [line[7:10], line[11], line[13:17] ]
989 seq_fields.extend(line[18:].split())
990
991 rank_base = int(seq_fields[0])
992 chain_id = seq_fields[1].strip()
993
994 if chain_id not in structure.chains:
995 raise HeaderFormatError('Chain {0} is undefined'.format(chain_id))
996
997 chain = structure.chains[chain_id]
998
999 if chain.type == SequenceTypes.Unknown:
1000 inner_residuerank = int(len(seq_fields[3:]) / 2) + 3
1001 for i in [inner_residuerank, 3, -1]:
1002 try:
1003 chain.type = self.guess_sequence_type(seq_fields[i])
1004 break
1005 except UnknownPDBResidueError:
1006 pass
1007
1008 for i, residue_name in enumerate(seq_fields[3:]):
1009 rank = rank_base * 13 - (13 - (i + 1))
1010 rname = self.parse_residue_safe(residue_name, as_type=chain.type)
1011 residue = csb.bio.structure.Residue.create(chain.type, rank=rank, type=rname)
1012 residue._pdb_name = residue_name
1013 structure.chains[chain_id].residues.append(residue)
1014 assert structure.chains[chain_id].residues.last_index == rank
1015
1016 elif line.startswith('MODEL') or line.startswith('ATOM'):
1017 break
1018
1019 return structure
1020
1023 """
1024 Ultra fast PDB HEADER parser. Does not read any structural data.
1025 """
1026
1029
1032
1035
1038 """
1039 This is a customized PDB parser, which is designed to read both sequence and
1040 atom data from the ATOM section. This is especially useful when parsing PDB
1041 files without a header.
1042 """
1043
1138
1139
1140 StructureParser = AbstractStructureParser.create_parser
1141 """
1142 Alias for L{AbstractStructureParser.create_parser}.
1143 """
1146 """
1147 Base abstract files for all structure file formatters.
1148 Defines a common step-wise interface according to the Builder pattern.
1149
1150 @param output: output stream (this is where the product is constructed)
1151 @type output: stream
1152 """
1153
1154 __metaclass__ = ABCMeta
1155
1157
1158 if not hasattr(output, 'write'):
1159 raise TypeError(output)
1160
1161 def isnull(this, that, null=None):
1162 if this is null:
1163 return that
1164 else:
1165 return this
1166
1167 self._out = output
1168 self._isnull = isnull
1169
1170 @property
1172 """
1173 Destination stream
1174 @rtype: stream
1175 """
1176 return self._out
1177
1178 @property
1180 """
1181 ISNULL(X, Y) function
1182 @rtype: callable
1183 """
1184 return self._isnull
1185
1187 """
1188 Write a chunk of text
1189 """
1190 self._out.write(text)
1191
1193 """
1194 Write a chunk of text and append a new line terminator
1195 """
1196 self._out.write(text)
1197 self._out.write('\n')
1198
1199 @abstractmethod
1202
1203 @abstractmethod
1206
1209
1211 """
1212 PDB file format builder.
1213 """
1214
1217
1219 """
1220 Write the HEADER of the file using C{master}
1221
1222 @type master: L{Structure}
1223 """
1224
1225 isnull = self.isnull
1226
1227 header = 'HEADER {0:40}{1:%d-%b-%y} {2:4}'
1228 self.writeline(header.format('.', datetime.datetime.now(), master.accession.upper()))
1229
1230 molecules = { }
1231
1232 for chain_id in master.chains:
1233 chain = master.chains[chain_id]
1234 if chain.molecule_id not in molecules:
1235 molecules[chain.molecule_id] = [ ]
1236 molecules[chain.molecule_id].append(chain_id)
1237
1238 k = 0
1239 for mol_id in sorted(molecules):
1240
1241 chains = molecules[mol_id]
1242 first_chain = master.chains[ chains[0] ]
1243
1244 self.writeline('COMPND {0:3} MOL_ID: {1};'.format(k + 1, isnull(mol_id, '0')))
1245 self.writeline('COMPND {0:3} MOLECULE: {1};'.format(k + 2, isnull(first_chain.name, '')))
1246 self.writeline('COMPND {0:3} CHAIN: {1};'.format(k + 3, ', '.join(chains)))
1247 k += 3
1248
1249 for chain_id in master.chains:
1250
1251 chain = master.chains[chain_id]
1252 res = [ r._pdb_name for r in chain.residues ]
1253
1254 rn = 0
1255 for j in range(0, chain.length, 13):
1256 rn += 1
1257 residues = [ '{0:>3}'.format(r) for r in res[j : j + 13] ]
1258 self.writeline('SEQRES {0:>3} {1} {2:>4} {3}'.format(
1259 rn, chain.id, chain.length, ' '.join(residues) ))
1260
1262 """
1263 Append a new model to the file
1264
1265 @type structure: L{Structure}
1266 """
1267
1268 isnull = self.isnull
1269
1270 for chain_id in structure.chains:
1271
1272 chain = structure.chains[chain_id]
1273 for residue in chain.residues:
1274
1275 atoms = [ ]
1276 for an in residue.atoms:
1277 atom = residue.atoms[an]
1278 if isinstance(atom, csb.bio.structure.DisorderedAtom):
1279 for dis_atom in atom: atoms.append(dis_atom)
1280 else:
1281 atoms.append(atom)
1282 atoms.sort()
1283
1284 for atom in atoms:
1285
1286 alt = atom.alternate
1287 if alt is True:
1288 alt = 'A'
1289 elif alt is False:
1290 alt = ' '
1291
1292 if atom.element:
1293 element = repr(atom.element)
1294 else:
1295 element = ' '
1296 self.writeline('ATOM {0:>5} {1:>4}{2}{3:>3} {4}{5:>4}{6} {7:>8.3f}{8:>8.3f}{9:>8.3f}{10:>6.2f}{11:>6.2f}{12:>12}{13:2}'.format(
1297 atom.serial_number, atom._full_name, isnull(alt, ' '),
1298 residue._pdb_name, chain.id,
1299 isnull(residue.sequence_number, residue.rank), isnull(residue.insertion_code, ' '),
1300 atom.vector[0], atom.vector[1], atom.vector[2], isnull(atom.occupancy, 0.0), isnull(atom.temperature, 0.0),
1301 element, isnull(atom.charge, ' ') ))
1302
1303 self.writeline('TER')
1304
1306 """
1307 Add the END marker
1308 """
1309 self.writeline('END')
1310 self._out.flush()
1311
1313 """
1314 Supports serialization of NMR ensembles.
1315
1316 Functions as a simple decorator, which wraps C{add_structure} with
1317 MODEL/ENDMDL records.
1318 """
1319
1327
1330 """
1331 Base class for all PDB data source providers.
1332
1333 Concrete classes need to implement the C{find} method, which abstracts the
1334 retrieval of a PDB structure file by a structure identifier. This is a hook
1335 method called internally by C{get}, but subclasses can safely override both
1336 C{find} and {get} to in order to achieve completely custom behavior.
1337 """
1338
1339 __metaclass__ = ABCMeta
1340
1343
1344 @abstractmethod
1345 - def find(self, id):
1346 """
1347 Attempt to discover a PDB file, given a specific PDB C{id}.
1348
1349 @param id: structure identifier (e.g. 1x80)
1350 @type id: str
1351 @return: path and file name on success, None otherwise
1352 @rtype: str or None
1353 """
1354 pass
1355
1356 - def get(self, id, model=None):
1357 """
1358 Discover, parse and return the PDB structure, corresponding to the
1359 specified C{id}.
1360
1361 @param id: structure identifier (e.g. 1x80)
1362 @type id: str
1363 @param model: optional model identifier
1364 @type model: str
1365
1366 @rtype: L{csb.bio.Structure}
1367
1368 @raise StructureNotFoundError: when C{id} could not be found
1369 """
1370 pdb = self.find(id)
1371
1372 if pdb is None:
1373 raise StructureNotFoundError(id)
1374 else:
1375 return StructureParser(pdb).parse_structure(model=model)
1376
1378 """
1379 Simple file system based PDB data source. Scans a list of local directories
1380 using pre-defined file name templates.
1381
1382 @param paths: a list of paths
1383 @type paths: iterable or str
1384 """
1385
1387
1388 self._templates = ['pdb{id}.ent', 'pdb{id}.pdb', '{id}.pdb', '{id}.ent']
1389 self._paths = csb.core.OrderedDict()
1390
1391 if paths is not None:
1392 if isinstance(paths, csb.core.string):
1393 paths = [paths]
1394
1395 for path in paths:
1396 self.add(path)
1397
1398 @property
1400 """
1401 Current search paths
1402 @rtype: tuple
1403 """
1404 return tuple(self._paths)
1405
1406 @property
1408 """
1409 Current file name match templates
1410 @rtype: tuple
1411 """
1412 return tuple(self._templates)
1413
1414 - def add(self, path):
1415 """
1416 Register a new local C{path}.
1417
1418 @param path: directory name
1419 @type path: str
1420
1421 @raise IOError: if C{path} is not a valid directory
1422 """
1423 if os.path.isdir(path):
1424 self._paths[path] = path
1425 else:
1426 raise IOError(path)
1427
1429 """
1430 Register a custom file name name C{template}. The template must contain
1431 an E{lb}idE{rb} macro, e.g. pdbE{lb}idE{rb}.ent
1432
1433 @param template: pattern
1434 @type template: str
1435 """
1436
1437 if '{id}' not in template:
1438 raise ValueError('Template does not contain an "{id}" macro')
1439
1440 if template not in self._templates:
1441 self._templates.append(template)
1442
1444 """
1445 Unregister an existing local C{path}.
1446
1447 @param path: directory name
1448 @type path: str
1449
1450 @raise ValueError: if C{path} had not been registered
1451 """
1452 if path not in self._paths:
1453 raise ValueError('path not found: {0}'.format(path))
1454
1455 del self._paths[path]
1456
1457 - def find(self, id):
1458
1459 for path in self._paths:
1460 for token in self.templates:
1461 fn = os.path.join(path, token.format(id=id))
1462 if os.path.exists(fn):
1463 return fn
1464
1465 return None
1466
1468 """
1469 Retrieves PDB structures from a specified remote URL.
1470 The URL requested from remote server takes the form: <prefix>/<ID><suffix>
1471
1472 @param prefix: URL prefix, including protocol
1473 @type prefix: str
1474 @param suffix: optional URL suffix (.ent by default)
1475 @type suffix: str
1476 """
1477
1478 - def __init__(self, prefix='http://www.rcsb.org/pdb/files/pdb', suffix='.ent'):
1485
1486 @property
1488 """
1489 Current URL prefix
1490 @rtype: str
1491 """
1492 return self._prefix
1493 @prefix.setter
1495 self._prefix = value
1496
1497 @property
1499 """
1500 Current URL suffix
1501 @rtype: str
1502 """
1503 return self._suffix
1504 @suffix.setter
1506 self._suffix = value
1507
1514
1515 - def find(self, id):
1529
1530 - def get(self, id, model=None):
1541
1543 """
1544 A custom PDB data source. Functions as a user-defined map of structure
1545 identifiers and their corresponding local file names.
1546
1547 @param files: initialization dictionary of id:file pairs
1548 @type files: dict-like
1549 """
1550
1552
1553 self._files = {}
1554 for id in files:
1555 self.add(id, files[id])
1556
1557 @property
1559 """
1560 List of currently registered file names
1561 @rtype: tuple
1562 """
1563 return tuple(self._files.values())
1564
1565 @property
1567 """
1568 List of currently registered structure identifiers
1569 @rtype: tuple
1570 """
1571 return tuple(self._files)
1572
1573 - def add(self, id, path):
1574 """
1575 Register a new local C{id}:C{path} pair.
1576
1577 @param id: structure identifier
1578 @type id: str
1579 @param path: path and file name
1580 @type path: str
1581
1582 @raise IOError: if C{path} is not a valid file name
1583 """
1584 if os.path.isfile(path):
1585 self._files[id] = path
1586 else:
1587 raise IOError(path)
1588
1590 """
1591 Unregister an existing structure C{id}.
1592
1593 @param id: structure identifier
1594 @type id: str
1595
1596 @raise ValueError: if C{id} had not been registered
1597 """
1598 if id not in self._files:
1599 raise ValueError(id)
1600 else:
1601 del self._files[id]
1602
1603 - def find(self, id):
1604
1605 if id in self._files:
1606 return self._files[id]
1607 else:
1608 return None
1609
1610 -def get(accession, model=None, prefix='http://www.rcsb.org/pdb/files/pdb'):
1611 """
1612 Download and parse a PDB entry.
1613
1614 @param accession: accession number of the entry
1615 @type accession: str
1616 @param model: model identifier
1617 @type model: str
1618 @param prefix: download URL prefix
1619 @type prefix: str
1620
1621 @return: object representation of the selected model
1622 @rtype: L{Structure}
1623 """
1624 return RemoteStructureProvider(prefix).get(accession, model=model)
1625
1626 -def find(id, paths):
1627 """
1628 Try to discover a PDB file for PDB C{id} in C{paths}.
1629
1630 @param id: PDB ID of the entry
1631 @type id: str
1632 @param paths: a list of directories to scan
1633 @type paths: list of str
1634
1635 @return: path and file name on success, None otherwise
1636 @rtype: str
1637 """
1638 return FileSystemStructureProvider(paths).find(id)
1639
1642
1643 - def __init__(self, result, exception):
1644
1645 self.result = result
1646 self.exception = exception
1647
1649 return '<AsyncParseResult: result={0.result}, error={0.exception.__class__.__name__}>'.format(self)
1650
1655
1658 """
1659 Wraps StructureParser in an asynchronous call. Since a new process is
1660 started by Python internally (as opposed to only starting a new thread),
1661 this makes the parser slower, but provides a way to set a parse timeout
1662 limit.
1663
1664 If initialized with more than one worker, supports parallel parsing
1665 through the C{self.parse_async} method.
1666
1667 @param workers: number of worker threads (1 by default)
1668 @type workers: int
1669 """
1670
1672
1673 self._pool = None
1674 self._workers = 1
1675
1676 if int(workers) > 0:
1677 self._workers = int(workers)
1678 else:
1679 raise ValueError(workers)
1680
1681 self._recycle()
1682
1684
1685 if self._pool:
1686 self._pool.terminate()
1687
1688 self._pool = multiprocessing.Pool(processes=self._workers)
1689
1692 """
1693 Call StructureParser.parse_structure() in a separate process and return
1694 the output. Raise TimeoutError if the parser does not respond within
1695 C{timeout} seconds.
1696
1697 @param structure_file: structure file to parse
1698 @type structure_file: str
1699 @param timeout: raise multiprocessing.TimeoutError if C{timeout} seconds
1700 elapse before the parser completes its job
1701 @type timeout: int
1702 @param parser: any implementing L{AbstractStructureParser} class
1703 @type parser: type
1704
1705 @return: parsed structure
1706 @rtype: L{csb.structure.Structure}
1707 """
1708
1709 r = self.parse_async([structure_file], timeout, model, parser)
1710 if len(r) > 0:
1711 if r[0].exception is not None:
1712 raise r[0].exception
1713 else:
1714 return r[0].result
1715 return None
1716
1719 """
1720 Call C{self.parse_structure} for a list of structure files
1721 simultaneously. The actual degree of parallelism will depend on the
1722 number of workers specified while constructing the parser object.
1723
1724 @note: Don't be tempted to pass a large list of structures to this
1725 method. Every time a C{TimeoutError} is encountered, the
1726 corresponding worker process in the pool will hang until the
1727 process terminates on its own. During that time, this worker is
1728 unusable. If a sufficiently high number of timeouts occur, the
1729 whole pool of workers will be unsable. At the end of the method
1730 however a pool cleanup is performed and any unusable workers
1731 are 'reactivated'. However, that only happens at B{the end} of
1732 C{parse_async}.
1733
1734 @param structure_files: a list of structure files
1735 @type structure_files: tuple of str
1736 @param timeout: raise multiprocessing.TimeoutError if C{timeout} seconds
1737 elapse before the parser completes its job
1738 @type timeout: int
1739 @param parser: any implementing L{AbstractStructureParser} class
1740 @type parser: type
1741
1742 @return: a list of L{AsyncParseResult} objects
1743 @rtype: list
1744 """
1745
1746 pool = self._pool
1747 workers = []
1748 results = []
1749
1750 for file in list(structure_files):
1751 result = pool.apply_async(_parse_async, [parser, file, model])
1752 workers.append(result)
1753
1754 hanging = False
1755 for w in workers:
1756 result = AsyncParseResult(None, None)
1757 try:
1758 result.result = w.get(timeout=timeout)
1759 except KeyboardInterrupt as ki:
1760 pool.terminate()
1761 raise ki
1762 except Exception as ex:
1763 result.exception = ex
1764 if isinstance(ex, multiprocessing.TimeoutError):
1765 hanging = True
1766 results.append(result)
1767
1768 if hanging:
1769 self._recycle()
1770
1771 return results
1772