Package csb :: Package bio :: Package io :: Module wwpdb
[frames] | no frames]

Source Code for Module csb.bio.io.wwpdb

   1  """ 
   2  PDB structure parsers, format builders and database providers. 
   3   
   4  The most basic usage is: 
   5   
   6      >>> parser = StructureParser('structure.pdb') 
   7      >>> parser.parse_structure() 
   8      <Structure>     # a Structure object (model) 
   9   
  10  or if this is an NMR ensemble: 
  11       
  12      >>> parser.parse_models() 
  13      <Ensemble>      # an Ensemble object (collection of alternative Structure-s) 
  14       
  15  This module introduces a family of PDB file parsers. The common interface of all 
  16  parsers is defined in L{AbstractStructureParser}. This class has several 
  17  implementations: 
  18   
  19      - L{RegularStructureParser} - handles normal PDB files with SEQRES fields 
  20      - L{LegacyStructureParser} - reads structures from legacy or malformed PDB files, 
  21        which are lacking SEQRES records (initializes all residues from the ATOMs instead) 
  22      - L{PDBHeaderParser} - reads only the headers of the PDB files and produces structures 
  23        without coordinates. Useful for reading metadata (e.g. accession numbers or just 
  24        plain SEQRES sequences) with minimum overhead 
  25         
  26  Unless you have a special reason, you should use the L{StructureParser} factory, 
  27  which returns a proper L{AbstractStructureParser} implementation, depending on the 
  28  input PDB file. If the input file looks like a regular PDB file, the factory 
  29  returns a L{RegularStructureParser}, otherwise it instantiates L{RegularStructureParser}. 
  30  L{StructureParser} is in fact an alias for L{AbstractStructureParser.create_parser}. 
  31   
  32  Another important abstraction in this module is L{StructureProvider}. It has several 
  33  implementations which can be used to retrieve PDB L{Structure}s from various sources: 
  34  file system directories, remote URLs, etc. You can easily create your own provider as 
  35  well. See L{StructureProvider} for details. 
  36   
  37  Finally, this module gives you some L{FileBuilder}s, used for text serialization  
  38  of L{Structure}s and L{Ensemble}s: 
  39   
  40      >>> builder = PDBFileBuilder(stream) 
  41      >>> builder.add_header(structure) 
  42      >>> builder.add_structure(structure) 
  43   
  44  where stream is any Python stream, e.g. an open file or sys.stdout. 
  45   
  46  See L{Ensemble} and L{Structure} from L{csb.bio.structure} for details on these 
  47  objects. 
  48  """ 
  49   
  50  import re 
  51  import os 
  52  import numpy 
  53  import datetime 
  54  import multiprocessing 
  55   
  56  import csb.bio.structure 
  57  import csb.core 
  58  import csb.io 
  59   
  60  from abc import ABCMeta, abstractmethod 
  61  from csb.bio.sequence import SequenceTypes, SequenceAlphabets 
  62  from csb.bio.structure import ChemElements, SecStructures 
  63   
  64   
  65  PDB_AMINOACIDS = { 
  66      'PAQ': 'TYR', 'AGM': 'ARG', 'ILE': 'ILE', 'PR3': 'CYS', 'GLN': 'GLN', 
  67      'DVA': 'VAL', 'CCS': 'CYS', 'ACL': 'ARG', 'GLX': 'GLX', 'GLY': 'GLY', 
  68      'GLZ': 'GLY', 'DTH': 'THR', 'OAS': 'SER', 'C6C': 'CYS', 'NEM': 'HIS', 
  69      'DLY': 'LYS', 'MIS': 'SER', 'SMC': 'CYS', 'GLU': 'GLU', 'NEP': 'HIS', 
  70      'BCS': 'CYS', 'ASQ': 'ASP', 'ASP': 'ASP', 'SCY': 'CYS', 'SER': 'SER', 
  71      'LYS': 'LYS', 'SAC': 'SER', 'PRO': 'PRO', 'ASX': 'ASX', 'DGN': 'GLN', 
  72      'DGL': 'GLU', 'MHS': 'HIS', 'ASB': 'ASP', 'ASA': 'ASP', 'NLE': 'LEU', 
  73      'DCY': 'CYS', 'ASK': 'ASP', 'GGL': 'GLU', 'STY': 'TYR', 'SEL': 'SER', 
  74      'CGU': 'GLU', 'ASN': 'ASN', 'ASL': 'ASP', 'LTR': 'TRP', 'DAR': 'ARG', 
  75      'VAL': 'VAL', 'CHG': 'ALA', 'TPO': 'THR', 'CLE': 'LEU', 'GMA': 'GLU', 
  76      'HAC': 'ALA', 'AYA': 'ALA', 'THR': 'THR', 'TIH': 'ALA', 'SVA': 'SER', 
  77      'MVA': 'VAL', 'SAR': 'GLY', 'LYZ': 'LYS', 'BNN': 'ALA', '5HP': 'GLU', 
  78      'IIL': 'ILE', 'SHR': 'LYS', 'HAR': 'ARG', 'FME': 'MET', 'ALO': 'THR', 
  79      'PHI': 'PHE', 'ALM': 'ALA', 'PHL': 'PHE', 'MEN': 'ASN', 'TPQ': 'ALA', 
  80      'GSC': 'GLY', 'PHE': 'PHE', 'ALA': 'ALA', 'MAA': 'ALA', 'MET': 'MET', 
  81      'UNK': 'UNK', 'LEU': 'LEU', 'ALY': 'LYS', 'SET': 'SER', 'GL3': 'GLY', 
  82      'TRG': 'LYS', 'CXM': 'MET', 'TYR': 'TYR', 'SCS': 'CYS', 'DIL': 'ILE', 
  83      'TYQ': 'TYR', '3AH': 'HIS', 'DPR': 'PRO', 'PRR': 'ALA', 'CME': 'CYS', 
  84      'IYR': 'TYR', 'CY1': 'CYS', 'TYY': 'TYR', 'HYP': 'PRO', 'DTY': 'TYR', 
  85      '2AS': 'ASP', 'DTR': 'TRP', 'FLA': 'ALA', 'DPN': 'PHE', 'DIV': 'VAL', 
  86      'PCA': 'GLU', 'MSE': 'MET', 'MSA': 'GLY', 'AIB': 'ALA', 'CYS': 'CYS', 
  87      'NLP': 'LEU', 'CYQ': 'CYS', 'HIS': 'HIS', 'DLE': 'LEU', 'CEA': 'CYS', 
  88      'DAL': 'ALA', 'LLP': 'LYS', 'DAH': 'PHE', 'HMR': 'ARG', 'TRO': 'TRP', 
  89      'HIC': 'HIS', 'CYG': 'CYS', 'BMT': 'THR', 'DAS': 'ASP', 'TYB': 'TYR', 
  90      'BUC': 'CYS', 'PEC': 'CYS', 'BUG': 'LEU', 'CYM': 'CYS', 'NLN': 'LEU', 
  91      'CY3': 'CYS', 'HIP': 'HIS', 'CSO': 'CYS', 'TPL': 'TRP', 'LYM': 'LYS', 
  92      'DHI': 'HIS', 'MLE': 'LEU', 'CSD': 'ALA', 'HPQ': 'PHE', 'MPQ': 'GLY', 
  93      'LLY': 'LYS', 'DHA': 'ALA', 'DSN': 'SER', 'SOC': 'CYS', 'CSX': 'CYS', 
  94      'OMT': 'MET', 'DSP': 'ASP', 'PTR': 'TYR', 'TRP': 'TRP', 'CSW': 'CYS', 
  95      'EFC': 'CYS', 'CSP': 'CYS', 'CSS': 'CYS', 'SCH': 'CYS', 'OCS': 'CYS', 
  96      'NMC': 'GLY', 'SEP': 'SER', 'BHD': 'ASP', 'KCX': 'LYS', 'SHC': 'CYS', 
  97      'C5C': 'CYS', 'HTR': 'TRP', 'ARG': 'ARG', 'TYS': 'TYR', 'ARM': 'ARG', 
  98      'DNP': 'ALA' 
  99      } 
 100  """ 
 101  Dictionary of non-standard amino acids, which could be found in PDB. 
 102  """ 
 103   
 104   
 105  PDB_NUCLEOTIDES = { 
 106      'DA': 'Adenine', 'DG': 'Guanine', 'DC': 'Cytosine', 'DT': 'Thymine', 
 107       'A': 'Adenine', 'G': 'Guanine', 'C': 'Cytosine', 'T': 'Thymine', 
 108       'U': 'Uracil', 'DOC': 'Cytosine', 'R': 'Purine', 'Y': 'Pyrimidine', 
 109       'K': 'Ketone', '  M': 'Amino', 'S': 'Strong', 'W': 'Weak', 
 110       'B': 'NotA', 'D'  : 'NotC', 'H': 'NotG', 'V': 'NotT', 
 111       'N': 'Any', 'X'  : 'Masked' 
 112      } 
 113  """ 
 114  Dictionary of non-standard nucleotides, which could be found in PDB. 
 115  """ 
116 117 -class PDBParseError(ValueError):
118 pass
119
120 -class HeaderFormatError(PDBParseError):
121 pass
122
123 -class SecStructureFormatError(PDBParseError):
124 pass
125
126 -class StructureFormatError(PDBParseError):
127 pass
128
129 -class UnknownPDBResidueError(PDBParseError):
130 pass
131
132 -class StructureNotFoundError(KeyError):
133 pass
134
135 -class InvalidEntryIDError(StructureFormatError):
136 pass
137
138 139 -class EntryID(object):
140 """ 141 Represents a PDB Chain identifier. Implementing classes must define 142 how the original ID is split into accession number and chain ID. 143 144 @param id: identifier 145 @type id: str 146 """ 147 __metaclass__ = ABCMeta 148
149 - def __init__(self, id):
150 151 self._accession = '' 152 self._chain = '' 153 154 id = id.strip() 155 self._accession, self._chain = self.parse(id)
156 157 @staticmethod
158 - def create(id):
159 """ 160 Guess the format of C{id} and parse it. 161 162 @return: a new PDB ID of the appropriate type 163 @rtype: L{EntryID} 164 """ 165 166 if len(id) in (4, 5): 167 return StandardID(id) 168 elif len(id) == 6 and id[4] == '_': 169 return SeqResID(id) 170 else: 171 return DegenerateID(id)
172 173 @abstractmethod
174 - def parse(self, id):
175 """ 176 Split C{id} into accession number and chain ID. 177 178 @param id: PDB identifier 179 @type id: str 180 @return: (accession, chain) 181 @rtype: tuple of str 182 183 @raise InvalidEntryIDError: when C{id} is in an inappropriate format 184 """ 185 pass
186
187 - def format(self):
188 """ 189 @return: the identifier in its original format 190 @rtype: str 191 """ 192 return self.entry_id
193 194 @property
195 - def accession(self):
196 """ 197 Accession number part of the Entry ID 198 @rtype: str 199 """ 200 return self._accession
201 202 @property
203 - def chain(self):
204 """ 205 Chain ID part of the Entry ID 206 @rtype: str 207 """ 208 return self._chain
209 210 @property
211 - def entry_id(self):
212 """ 213 Accession number + Chain ID 214 @rtype: str 215 """ 216 return "{0.accession}{0.chain}".format(self)
217
218 - def __str__(self):
219 return self.entry_id
220
221 -class StandardID(EntryID):
222 """ 223 Standard PDB ID in the following form: xxxxY, where xxxx is the accession 224 number (lower case) and Y is an optional chain identifier. 225 """
226 - def parse(self, id):
227 228 if len(id) not in (4, 5): 229 raise InvalidEntryIDError(id) 230 231 return (id[:4].lower(), id[4:])
232
233 -class DegenerateID(EntryID):
234 """ 235 Looks like a L{StandardID}, except that the accession number may have 236 arbitrary length. 237 """
238 - def parse(self, id):
239 240 if len(id) < 2: 241 raise InvalidEntryIDError(id) 242 243 return (id[:-1].lower(), id[-1])
244
245 -class SeqResID(EntryID):
246 """ 247 Same as a L{StandardID}, but contains an additional underscore between 248 te accession number and the chain identifier. 249 """
250 - def parse(self, id):
251 252 if not (len(id) == 6 and id[4] == '_'): 253 raise InvalidEntryIDError(id) 254 255 return (id[:4].lower(), id[5:])
256
257 - def format(self):
258 return "{0.accession}_{0.chain}".format(self)
259
260 261 -class AbstractStructureParser(object):
262 """ 263 A base PDB structure format-aware parser. Subclasses must implement the 264 internal abstract method C{_parse_header} in order to complete the 265 implementation. 266 267 @param structure_file: the input PD file to parse 268 @type structure_file: str 269 @param check_ss: if True, secondary structure errors in the file will cause 270 L{SecStructureFormatError} exceptions 271 @type check_ss: bool 272 273 @raise IOError: when the input file cannot be found 274 """ 275 276 __metaclass__ = ABCMeta 277 278 @staticmethod
279 - def create_parser(structure_file, check_ss=False):
280 """ 281 A StructureParser factory, which instantiates and returns the proper parser 282 object based on the contents of the PDB file. 283 284 If the file contains a SEQRES section, L{RegularStructureParser} is returned, 285 otherwise L{LegacyStructureParser} is instantiated. In the latter case 286 LegacyStructureParser will read the sequence data directly from the ATOMs. 287 288 @param structure_file: the PDB file to parse 289 @type structure_file: str 290 291 @rtype: L{AbstractStructureParser} 292 """ 293 has_seqres = False 294 295 for line in open(structure_file): 296 if line.startswith('SEQRES'): 297 has_seqres = True 298 break 299 300 if has_seqres: 301 return RegularStructureParser(structure_file) 302 else: 303 return LegacyStructureParser(structure_file)
304
305 - def __init__(self, structure_file, check_ss=False):
306 307 self._file = None 308 self._stream = None 309 self._check_ss = bool(check_ss) 310 311 self.filename = structure_file
312
313 - def __del__(self):
314 try: 315 self._stream.close() 316 except: 317 pass
318 319 @property
320 - def filename(self):
321 """ 322 Current input PDB file name 323 @rtype: str 324 """ 325 return self._file
326 @filename.setter
327 - def filename(self, name):
328 try: 329 stream = open(name) 330 except IOError: 331 raise IOError('File not found: {0}'.format(name)) 332 333 if self._stream: 334 try: 335 self._stream.close() 336 except: 337 pass 338 self._stream = stream 339 self._file = name
340
341 - def models(self):
342 """ 343 Find all available model identifiers in the structure. 344 345 @return: a list of model IDs 346 @rtype: list 347 """ 348 models = [] 349 check = set() 350 351 with open(self._file, 'r') as f: 352 for line in f: 353 if line.startswith('MODEL'): 354 model_id = int(line[10:14]) 355 if model_id in check: 356 raise StructureFormatError('Duplicate model identifier: {0}'.format(model_id)) 357 models.append(model_id) 358 check.add(model_id) 359 360 if len(models) > 0: 361 if not(min(check) == 1 and max(check) == len(models)): 362 raise StructureFormatError('Non-consecutive model identifier(s) encountered') 363 return models 364 else: 365 return []
366
367 - def guess_sequence_type(self, residue_name):
368 """ 369 Try to guess what is the sequence type of a PDB C{residue_name}. 370 371 @param residue_name: a PDB-conforming name of a residue 372 @type residue_name: str 373 374 @return: a L{SequenceTypes} enum member 375 @rtype: L{csb.core.EnumItem} 376 377 @raise UnknownPDBResidueError: when there is no such PDB residue name 378 in the catalog tables 379 """ 380 if residue_name in PDB_AMINOACIDS: 381 return SequenceTypes.Protein 382 elif residue_name in PDB_NUCLEOTIDES: 383 return SequenceTypes.NucleicAcid 384 else: 385 raise UnknownPDBResidueError(residue_name)
386
387 - def parse_residue(self, residue_name, as_type=None):
388 """ 389 Try to parse a PDB C{residue_name} and return its closest 'normal' 390 string representation. If a sequence type (C{as_type}) is defined, 391 guess the alphabet based on that information, otherwise try first to 392 parse it as a protein residue. 393 394 @param residue_name: a PDB-conforming name of a residue 395 @type residue_name: str 396 @param as_type: suggest a sequence type (L{SequenceTypes} member) 397 @type L{scb.core.EnumItem} 398 399 @return: a normalized residue name 400 @rtype: str 401 402 @raise UnknownPDBResidueError: when there is no such PDB residue name 403 in the catalog table(s) 404 """ 405 if as_type is None: 406 as_type = self.guess_sequence_type(residue_name) 407 408 try: 409 if as_type == SequenceTypes.Protein: 410 return PDB_AMINOACIDS[residue_name] 411 elif as_type == SequenceTypes.NucleicAcid: 412 return PDB_NUCLEOTIDES[residue_name] 413 else: 414 raise UnknownPDBResidueError(residue_name) 415 except KeyError: 416 raise UnknownPDBResidueError(residue_name)
417
418 - def parse_residue_safe(self, residue_name, as_type):
419 """ 420 Same as C{parse_residue}, but returns UNK/Any instead of raising 421 UnknownPDBResidueError. 422 423 @param residue_name: a PDB-conforming name of a residue 424 @type residue_name: str 425 @param as_type: suggest a sequence type (L{SequenceTypes} member) 426 @type L{scb.core.EnumItem} 427 428 @return: a normalized residue name 429 @rtype: str 430 """ 431 try: 432 return self.parse_residue(residue_name, as_type) 433 except UnknownPDBResidueError: 434 if as_type == SequenceTypes.Protein: 435 return repr(SequenceAlphabets.Protein.UNK) 436 elif as_type == SequenceTypes.NucleicAcid: 437 return repr(SequenceAlphabets.Nucleic.Any) 438 else: 439 return repr(SequenceAlphabets.Unknown.UNK)
440
441 - def parse(self, filename=None, model=None):
442 443 if filename: 444 self.filename = filename 445 return self.parse_structure(model)
446
447 - def parse_structure(self, model=None):
448 """ 449 Parse and return the L{Structure} with the specified model identifier. 450 If no explicit model is specified, parse the first model in the 451 structure. 452 453 @param model: parse exactly the model with this ID 454 @type model: str 455 456 @return: object representation of the selected model 457 @rtype: L{Structure} 458 459 @raise ValueError: When an invalid model ID is specified 460 @raise PDBParseError: When the input PDB file suffers from unrecoverable 461 corruption. More specialized exceptions will be 462 raised depending on the context (see L{PDBParseError}'s 463 subclasses). 464 """ 465 if model is not None: 466 model = int(model) 467 468 structure = self._parse_header(model) 469 470 self._parse_atoms(structure, model) 471 self._parse_ss(structure) 472 473 return structure
474
475 - def parse_models(self, models=()):
476 """ 477 Parse the specified models in the file and build an L{Ensemble}. 478 479 @param models: an iterable object providing model identifiers. 480 If not specified, all models will be parsed. 481 @type models: tuple 482 483 @return: an ensemble with all parsed models 484 @rtype: L{Ensemble} 485 """ 486 487 if not models: 488 models = self.models() 489 else: 490 models = list(map(int, models)) 491 492 ensemble = csb.bio.structure.Ensemble() 493 494 if len(models) > 0: 495 for model_id in models: 496 model = self.parse_structure(model_id) 497 ensemble.models.append(model) 498 else: 499 model = self.parse_structure() 500 model.model_id = 1 501 ensemble.models.append(model) 502 503 return ensemble
504
505 - def parse_biomolecule(self, number=1, single=False):
506 """ 507 Parse and return the L{Structure} of the biological unit (quaternary 508 structure) as annotated by the REMARK 350 BIOMOLECULE record. 509 510 @param number: biomolecule number 511 @type number: int 512 513 @param single: if True, assign new single-letter chain 514 identifiers. If False, assign multi-letter chain identifiers whith a 515 number appended to the original identifier, like "A1", "A2", ... 516 @type single: bool 517 518 @return: structure of biological unit 519 @rtype: L{Structure} 520 """ 521 remarks = self._parse_remarks() 522 if 350 not in remarks: 523 raise PDBParseError('There is no REMARK 350') 524 525 current = 1 526 biomt = {current: {}} 527 chains = tuple() 528 529 def split(line): 530 return [c.strip() for c in line.split(',') if c.strip() != '']
531 532 for line in remarks[350]: 533 if line.startswith('BIOMOLECULE:'): 534 current = int(line[12:]) 535 biomt[current] = {} 536 elif line.startswith('APPLY THE FOLLOWING TO CHAINS:'): 537 chains = tuple(split(line[30:])) 538 elif line.startswith(' AND CHAINS:'): 539 chains += tuple(split(line[30:])) 540 elif line.startswith(' BIOMT'): 541 num = int(line[8:12]) 542 vec = line[12:].split() 543 vec = list(map(float, vec)) 544 biomt[current].setdefault(chains, dict()).setdefault(num, []).extend(vec) 545 546 if number not in biomt or len(biomt[number]) == 0: 547 raise KeyError('no BIOMOLECULE number {0}'.format(number)) 548 549 asu = self.parse_structure() 550 structure = csb.bio.structure.Structure('{0}_{1}'.format(asu.accession, number)) 551 552 newchainiditer = iter('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789') 553 554 for chains, matrices in biomt[number].items(): 555 for num in matrices: 556 mat = numpy.array(matrices[num][0:12]).reshape((3,4)) 557 R, t = mat[:3,:3], mat[:3,3] 558 559 for chain in chains: 560 if chain not in asu: 561 raise PDBParseError('chain {0} missing'.format(chain)) 562 copy = asu[chain].clone() 563 copy.transform(R, t) 564 if single: 565 if len(structure.chains) == 62: 566 raise ValueError('too many chains for single=True') 567 copy.id = next(newchainiditer) 568 else: 569 copy.id = '{0}{1}'.format(chain, num) 570 structure.chains.append(copy) 571 572 return structure
573 574 @abstractmethod
575 - def _parse_header(self, model):
576 """ 577 An abstract method, which subclasses must use to customize the way the 578 PDB header (or the absence of a such) is handled. The implementation 579 must rely on reading character data from the current internal 580 self._stream and must return a new L{csb.bio.structure.Structure} 581 instance with properly initialized header data: accession, model, 582 molecule identifiers, chains and residues. This structure object is 583 then internally passed to the C{_parse_atoms} hook, responsible for 584 attachment of the atoms to the residues in the structure. 585 """ 586 pass
587
588 - def _scroll_model(self, model, stream):
589 """ 590 Scroll the C{stream} to the specified C{model}. 591 """ 592 593 while True: 594 try: 595 line = next(stream) 596 except StopIteration: 597 raise ValueError('No such model {0} in the structure.'.format(model)) 598 599 if line.startswith('MODEL'): 600 model_id = int(line[10:14]) 601 if model == model_id: 602 return model_id
603
604 - def _parse_atoms(self, structure, model):
605 """ 606 Parse the ATOMs from the specified C{model} and attach them to the 607 C{structure}. 608 """ 609 610 structure.model_id = None 611 612 atoms = dict( (chain, []) for chain in structure.chains ) 613 chains = set() 614 total_chains = len([c for c in structure.items if c.length > 0]) 615 het_residues = dict( (chain, set()) for chain in structure.chains ) 616 in_ligands = False 617 in_atom = False 618 619 self._stream.seek(0) 620 while True: 621 622 try: 623 line = next(self._stream) 624 except StopIteration: 625 break 626 627 if line.startswith('HET '): 628 het_residue, het_chain, het_residue_id = line[7:10].strip(), line[12], line[13:18].strip() 629 630 if het_chain in structure: 631 chain = structure.chains[het_chain] 632 if chain.type == SequenceTypes.Protein and het_residue in PDB_AMINOACIDS: 633 het_residues[het_chain].add(het_residue_id) 634 elif chain.type == SequenceTypes.NucleicAcid and het_residue in PDB_NUCLEOTIDES: 635 het_residues[het_chain].add(het_residue_id) 636 637 elif line.startswith('MODEL'): 638 if model and model != int(line[10:14]): 639 self._scroll_model(model, self._stream) 640 structure.model_id = model 641 else: 642 model = int(line[10:14]) 643 structure.model_id = model 644 645 elif line.startswith('ATOM') \ 646 or (line.startswith('HETATM') and not in_ligands): 647 in_atom = True 648 649 rank = int(line[22:26]) 650 serial_number = int(line[6:11]) 651 name = line[12:16] 652 x, y, z = line[30:38], line[38:46], line[46:54] 653 vector = numpy.array([float(x), float(y), float(z)]) 654 element = line[76:78].strip() 655 if element: 656 try: 657 element = csb.core.Enum.parsename(ChemElements, element) 658 except csb.core.EnumMemberError: 659 if element in ('D', 'X'): 660 element = ChemElements.x 661 else: 662 raise StructureFormatError('Unknown chemical element: {0}'.format(element)) 663 else: 664 element = None 665 666 atom = csb.bio.structure.Atom(serial_number, name, element, 667 vector) 668 669 atom._het = line.startswith('HETATM') 670 atom._rank = rank 671 atom._sequence_number = int(line[22:26].strip()) 672 atom._residue_id = str(atom._sequence_number) 673 atom._insertion_code = line[26].strip() 674 if not atom._insertion_code: 675 atom._insertion_code = None 676 else: 677 atom._residue_id += atom._insertion_code 678 679 atom.alternate = line[16].strip() 680 if not atom.alternate: 681 atom.alternate = None 682 683 atom._chain = line[21].strip() 684 if atom._chain not in structure.chains: 685 raise StructureFormatError( 686 'Atom {0}: chain {1} is undefined'.format(serial_number, atom._chain)) 687 chains.add(atom._chain) 688 residue_name = line[17:20].strip() 689 residue_name = self.parse_residue_safe(residue_name, as_type=structure.chains[atom._chain].type) 690 if structure.chains[atom._chain].type == SequenceTypes.NucleicAcid: 691 atom._residue_name = csb.core.Enum.parsename(SequenceAlphabets.Nucleic, residue_name) 692 else: 693 atom._residue_name = csb.core.Enum.parsename(SequenceAlphabets.Protein, residue_name) 694 695 atom.occupancy = float(line[54:60]) 696 atom.bfactor = float(line[60:66]) 697 698 charge = line[78:80].strip() 699 if charge: 700 if charge in ('+', '-'): 701 charge += '1' 702 if charge[-1] in ('+', '-'): 703 charge = charge[::-1] 704 atom.charge = int(charge) 705 706 atoms[atom._chain].append(atom) 707 708 elif in_atom and line.startswith('TER'): 709 in_atom = False 710 if len(chains) == total_chains: 711 in_ligands = True 712 713 elif line.startswith('ENDMDL'): 714 break 715 716 elif line.startswith('END'): 717 break 718 719 if structure.model_id != model: 720 raise ValueError('No such model {0} in the structure.'.format(model)) 721 722 self._map_residues(structure, atoms, het_residues)
723
724 - def _map_residues(self, structure, atoms, het_residues):
725 726 assert set(structure.chains) == set(atoms.keys()) 727 728 for chain in structure.chains: 729 730 subject = structure.chains[chain].sequence 731 query = '' 732 fragments = [] 733 734 seq_numbers = [] 735 lookup = {} 736 737 i = -1 738 for a in atoms[chain]: 739 if a._residue_id not in lookup: 740 i += 1 741 lookup[a._residue_id] = [a._sequence_number, a._insertion_code] 742 seq_numbers.append(a._residue_id) 743 res_name = a._residue_name.value 744 res_id = '{0}{1}'.format(a._sequence_number or '', a._insertion_code or '').strip() 745 if a._het and res_id not in het_residues[chain]: 746 # if it is a HETATM, but not a modified residue, initiate an optional fragment 747 fragments.append([res_name, '?']) 748 elif a._insertion_code and not (i > 0 and lookup[seq_numbers[i - 1]][1]): 749 fragments.append([res_name]) 750 elif i == 0 or a._sequence_number - lookup[seq_numbers[i - 1]][0] not in (0, 1, -1): 751 # if residues [i, i-1] are not consecutive or 'overlapping', initiate a new fragment: 752 fragments.append([res_name]) 753 else: 754 # then they are consecutive 755 if fragments[-1][-1] == '?': 756 # but the prev was optional (maybe HET), all optionals *MUST* reside in 757 # singleton fragments, so start a new fragment 758 fragments.append([res_name]) 759 else: 760 # append the new residue to the end of the last fragment 761 fragments[-1].append(res_name) 762 763 for i, frag in enumerate(fragments): 764 fragments[i] = ''.join(frag) 765 if len(fragments) > 100: 766 # Python's regex engine will crash with more than 100 groups 767 raise StructureFormatError("Can't map structure with more than 100 fragments in ATOMS") 768 query = '^.*?({0}).*?$'.format(').*?('.join(fragments)) 769 770 matches = re.match(query, subject) 771 772 if matches: 773 seq_numitem = -1 774 for frag_number, frag in enumerate(matches.groups(), start=1): 775 if frag is '': 776 # optional fragment, which did not match (NOTE: all optionals must occupy 777 # their own fragments of length 1 residue) 778 seq_numitem += 1 # this 1 implies that all optional fragments are 1 residue long 779 else: 780 for i, dummy in enumerate(frag, start=1): 781 seq_numitem += 1 782 # lookup[res_id] is finally set to give the final rank of residue under id res_id: 783 try: 784 lookup[ seq_numbers[seq_numitem] ] = matches.start(frag_number) + i 785 except: 786 raise 787 788 fixed_residue = None 789 for atom in atoms[chain]: 790 if not isinstance(lookup[atom._residue_id], int): 791 continue # this atom was not mapped (e.g. HET) 792 atom._rank = lookup[atom._residue_id] 793 residue = structure.chains[chain].residues[atom._rank] 794 if residue is not fixed_residue: 795 residue.id = atom._sequence_number, atom._insertion_code 796 fixed_residue = residue 797 798 assert str(residue.type) == subject[atom._rank - 1] 799 residue.atoms.append(atom) 800 801 del atom._rank 802 del atom._insertion_code 803 del atom._sequence_number 804 del atom._chain 805 del atom._residue_id 806 del atom._residue_name 807 else: 808 if structure.chains[chain].length == 0 and len(atoms[chain]) > 0: 809 raise StructureFormatError("Can't add atoms: chain {0} has no residues".format(chain)) 810 else: 811 raise StructureFormatError("Can't map atoms")
812
813 - def _parse_ss(self, structure):
814 """ 815 Parse and attach secondary structure data. 816 817 @bug: Currently the PDB helix types are ignored. Each HELIX line is treated 818 as a regular SecStructures.Helix. This is due to incompatibility 819 between DSSP and PDB helix types. 820 @todo: Implement a proper workaround for the previous bug (e.g. skip all 821 helices types not included in the DSSP enum) 822 823 @warning: In this implementation only the start/end positions of the SS 824 elements are parsed. Additional data like H-bonding is ignored. 825 826 @bug: Currently structure.to_pdb() is not writing any SS data. 827 """ 828 elements = {} 829 self._stream.seek(0) 830 831 while True: 832 try: 833 line = next(self._stream) 834 except StopIteration: 835 break 836 837 if line.startswith('HELIX'): 838 839 chain = structure.chains[line[19].strip()] 840 if chain.id not in elements: 841 elements[chain.id] = [] 842 if chain.id != line[31].strip(): 843 if self._check_ss: 844 raise SecStructureFormatError('Helix {0} spans multiple chains'.format(line[7:10])) 845 else: 846 continue 847 try: 848 startres = chain.find(line[21:25].strip(), line[25].strip()) 849 endres = chain.find(line[33:37].strip(), line[37].strip()) 850 except csb.core.ItemNotFoundError as ex: 851 if self._check_ss: 852 raise SecStructureFormatError( 853 'Helix {0} refers to an undefined residue ID: {1}'.format(line[7:10], str(ex))) 854 else: 855 continue 856 if not startres.rank <= endres.rank: 857 if self._check_ss: 858 raise SecStructureFormatError('Helix {0} is out of range'.format(line[7:10])) 859 else: 860 continue 861 helix = csb.bio.structure.SecondaryStructureElement(startres.rank, endres.rank, SecStructures.Helix) 862 elements[chain.id].append(helix) 863 864 if line.startswith('SHEET'): 865 866 chain = structure.chains[line[21].strip()] 867 if chain.id not in elements: 868 elements[chain.id] = [] 869 if chain.id != line[32].strip(): 870 if self._check_ss: 871 raise SecStructureFormatError('Sheet {0} spans multiple chains'.format(line[7:10])) 872 else: 873 continue 874 try: 875 startres = chain.find(line[22:26].strip(), line[26].strip()) 876 endres = chain.find(line[33:37].strip(), line[37].strip()) 877 except csb.core.ItemNotFoundError as ex: 878 if self._check_ss: 879 raise SecStructureFormatError( 880 'Sheet {0} refers to an undefined residue ID: {1}'.format(line[7:10], str(ex))) 881 else: 882 continue 883 if not startres.rank <= endres.rank: 884 if self._check_ss: 885 raise SecStructureFormatError('Sheet {0} is out of range'.format(line[7:10])) 886 else: 887 continue 888 strand = csb.bio.structure.SecondaryStructureElement(startres.rank, endres.rank, SecStructures.Strand) 889 elements[chain.id].append(strand) 890 891 elif line.startswith('MODEL') or line.startswith('ATOM'): 892 break 893 894 for chain_id in elements: 895 ss = csb.bio.structure.SecondaryStructure() 896 for e in elements[chain_id]: 897 ss.append(e) 898 structure.chains[chain_id].secondary_structure = ss
899
900 - def _parse_remarks(self):
901 """ 902 Read REMARK lines from PDB file. 903 904 @return: dictionary with remark numbers as keys, and lists of lines as values. 905 @rtype: dict 906 """ 907 self._stream.seek(0) 908 909 remarks = {} 910 911 for line in self._stream: 912 if line.startswith('REMARK'): 913 num = int(line[7:10]) 914 lstring = line[11:] 915 remarks.setdefault(num, []).append(lstring) 916 elif line.startswith('DBREF') or line.startswith('ATOM'): 917 break 918 919 return remarks
920
921 922 -class RegularStructureParser(AbstractStructureParser):
923 """ 924 This is the de facto PDB parser, which is designed to read SEQRES and ATOM 925 sections separately, and them map them. Intentionally fails to parse 926 malformed PDB files, e.g. a PDB file without a HEADER section. 927 """ 928
929 - def _parse_header(self, model):
930 """ 931 Parse the HEADER section of a regular PDB file. 932 933 @return: a L{csb.bio.structure.Structure} instance with properly 934 initialized residues from the SEQRES. 935 @rtype: L{csb.bio.structure.Structure} 936 937 @raise PDBParseError: if the stream has no HEADER at byte 0 938 """ 939 940 self._stream.seek(0) 941 942 header = next(self._stream) 943 if not header.startswith('HEADER'): 944 raise PDBParseError('Does not look like a regular PDB file.') 945 946 structure = csb.bio.structure.Structure(header.split()[-1]) 947 948 while True: 949 950 try: 951 line = next(self._stream) 952 except StopIteration: 953 break 954 955 if line.startswith('COMPND'): 956 if line[10:].lstrip().startswith('MOL_ID:'): 957 mol_id = int(line[18:].replace(';', '').strip()) 958 chain_name = '' 959 chains = '' 960 while line.startswith('COMPND'): 961 line = next(self._stream) 962 if line.split()[2].startswith('MOLECULE:'): 963 chain_name += line[20:].strip() 964 while not chain_name.endswith(';'): 965 line = next(self._stream) 966 if not line.startswith('COMPND'): 967 break 968 chain_name += ' ' + line[11:].strip() 969 else: 970 while not line.split()[2].startswith('CHAIN:'): 971 line = next(self._stream) 972 if not line.startswith('COMPND'): 973 raise HeaderFormatError('Missing chain identifier in COMPND section') 974 chains = line[17:].strip() 975 while not chains.endswith(';'): 976 line = next(self._stream) 977 if not line.startswith('COMPND'): 978 break 979 chains += ', ' + line[11:].strip() 980 break 981 982 chain_name = chain_name.strip()[:-1] 983 for chain in chains.replace(';', ' ').replace(',', ' ').split() or ['']: # the second part deals with an empty chain id 984 new_chain = csb.bio.structure.Chain(chain, type=SequenceTypes.Unknown, 985 name=chain_name, accession=structure.accession) 986 new_chain.molecule_id = mol_id 987 try: 988 structure.chains.append(new_chain) 989 except csb.core.DuplicateKeyError: 990 raise HeaderFormatError('Chain {0} is already defined.'.format(new_chain.id)) 991 992 elif line.startswith('SEQRES'): 993 # Correct handling of empty chain id 994 seq_fields = [line[7:10], line[11], line[13:17] ] 995 seq_fields.extend(line[18:].split()) 996 997 rank_base = int(seq_fields[0]) 998 chain_id = seq_fields[1].strip() 999 1000 if chain_id not in structure.chains: 1001 raise HeaderFormatError('Chain {0} is undefined'.format(chain_id)) 1002 1003 chain = structure.chains[chain_id] 1004 1005 if chain.type == SequenceTypes.Unknown: 1006 inner_residuerank = int(len(seq_fields[3:]) / 2) + 3 1007 for i in [inner_residuerank, 3, -1]: 1008 try: 1009 chain.type = self.guess_sequence_type(seq_fields[i]) 1010 break 1011 except UnknownPDBResidueError: 1012 pass 1013 1014 for i, residue_name in enumerate(seq_fields[3:]): 1015 rank = rank_base * 13 - (13 - (i + 1)) 1016 rname = self.parse_residue_safe(residue_name, as_type=chain.type) 1017 residue = csb.bio.structure.Residue.create(chain.type, rank=rank, type=rname) 1018 residue._pdb_name = residue_name 1019 structure.chains[chain_id].residues.append(residue) 1020 assert structure.chains[chain_id].residues.last_index == rank 1021 1022 elif line.startswith('MODEL') or line.startswith('ATOM'): 1023 break 1024 1025 return structure
1026
1027 1028 -class PDBHeaderParser(RegularStructureParser):
1029 """ 1030 Ultra fast PDB HEADER parser. Does not read any structural data. 1031 """ 1032
1033 - def _parse_atoms(self, structure, model):
1034 pass
1035
1036 - def _parse_ss(self, structure):
1037 pass
1038
1039 - def _parse_header(self, model):
1040 return super(PDBHeaderParser, self)._parse_header(model)
1041
1042 1043 -class LegacyStructureParser(AbstractStructureParser):
1044 """ 1045 This is a customized PDB parser, which is designed to read both sequence and 1046 atom data from the ATOM section. This is especially useful when parsing PDB 1047 files without a header. 1048 """ 1049
1050 - def _parse_header(self, model):
1051 """ 1052 Initialize a structure with residues from the ATOMs section. 1053 1054 @param model: model identifier (e.g. if multiple models exist) 1055 @type model: str 1056 1057 @return: a L{csb.bio.structure.Structure} instance with properly 1058 initialized residues from ATOMs under the specified C{model}. 1059 @rtype: L{csb.bio.structure.Structure} 1060 """ 1061 1062 self._stream.seek(0) 1063 in_atom = False 1064 has_atoms = False 1065 chains = csb.core.OrderedDict() 1066 1067 header = next(self._stream) 1068 if header.startswith('HEADER'): 1069 structure = csb.bio.structure.Structure(header.split()[-1]) 1070 else: 1071 self._stream.seek(0) 1072 structure = csb.bio.structure.Structure('NONE') 1073 1074 structure.model_id = None 1075 1076 while True: 1077 1078 try: 1079 line = next(self._stream) 1080 except StopIteration: 1081 break 1082 1083 if line.startswith('MODEL'): 1084 if model and model != int(line[10:14]): 1085 self._scroll_model(model, self._stream) 1086 structure.model_id = model 1087 else: 1088 model = int(line[10:14]) 1089 structure.model_id = model 1090 1091 elif line.startswith('ATOM') \ 1092 or (in_atom and line.startswith('HETATM')): 1093 in_atom = True 1094 has_atoms = True 1095 1096 residue_id = line[22:27].strip() 1097 residue_name = line[17:20].strip() 1098 chain_id = line[21].strip() 1099 1100 if chain_id not in chains: 1101 chains[chain_id] = csb.core.OrderedDict() 1102 1103 new_chain = csb.bio.structure.Chain( 1104 chain_id, 1105 type=SequenceTypes.Unknown, 1106 accession=structure.accession) 1107 new_chain.molecule_id = '1' 1108 structure.chains.append(new_chain) 1109 1110 if residue_id not in chains[chain_id]: 1111 chains[chain_id][residue_id] = residue_name 1112 1113 if structure.chains[chain_id].type == SequenceTypes.Unknown: 1114 try: 1115 structure.chains[chain_id].type = self.guess_sequence_type(residue_name) 1116 except UnknownPDBResidueError: 1117 pass 1118 1119 elif in_atom and line.startswith('TER'): 1120 in_atom = False 1121 1122 elif line.startswith('ENDMDL'): 1123 break 1124 1125 elif line.startswith('END'): 1126 break 1127 1128 if not has_atoms: 1129 raise HeaderFormatError("Can't parse legacy structure: no ATOMs found") 1130 1131 for chain_id in structure.chains: 1132 chain = structure.chains[chain_id] 1133 1134 for residue_id in chains[chain_id]: 1135 residue_name = chains[chain_id][residue_id] 1136 rank = (chain.residues.last_index or 0) + 1 1137 1138 rname = self.parse_residue_safe(residue_name, as_type=structure.chains[chain_id].type) 1139 residue = csb.bio.structure.Residue.create(chain.type, rank=rank, type=rname) 1140 residue._pdb_name = residue_name 1141 chain.residues.append(residue) 1142 1143 return structure
1144 1145 1146 StructureParser = AbstractStructureParser.create_parser 1147 """ 1148 Alias for L{AbstractStructureParser.create_parser}. 1149 """
1150 1151 -class FileBuilder(object):
1152 """ 1153 Base abstract files for all structure file formatters. 1154 Defines a common step-wise interface according to the Builder pattern. 1155 1156 @param output: output stream (this is where the product is constructed) 1157 @type output: stream 1158 """ 1159 1160 __metaclass__ = ABCMeta 1161
1162 - def __init__(self, output):
1163 1164 if not hasattr(output, 'write'): 1165 raise TypeError(output) 1166 1167 def isnull(this, that, null=None): 1168 if this is null: 1169 return that 1170 else: 1171 return this
1172 1173 self._out = output 1174 self._isnull = isnull
1175 1176 @property
1177 - def output(self):
1178 """ 1179 Destination stream 1180 @rtype: stream 1181 """ 1182 return self._out
1183 1184 @property
1185 - def isnull(self):
1186 """ 1187 ISNULL(X, Y) function 1188 @rtype: callable 1189 """ 1190 return self._isnull
1191
1192 - def write(self, text):
1193 """ 1194 Write a chunk of text 1195 """ 1196 self._out.write(text)
1197
1198 - def writeline(self, text):
1199 """ 1200 Write a chunk of text and append a new line terminator 1201 """ 1202 self._out.write(text) 1203 self._out.write('\n')
1204 1205 @abstractmethod
1206 - def add_header(self, master_structure):
1207 pass
1208 1209 @abstractmethod
1210 - def add_structure(self, structure):
1211 pass
1212
1213 - def finalize(self):
1214 pass
1215
1216 -class PDBFileBuilder(FileBuilder):
1217 """ 1218 PDB file format builder. 1219 """ 1220
1221 - def writeline(self, text):
1222 self.write('{0:80}\n'.format(text))
1223
1224 - def add_header(self, master):
1225 """ 1226 Write the HEADER of the file using C{master} 1227 1228 @type master: L{Structure} 1229 """ 1230 1231 isnull = self.isnull 1232 1233 header = 'HEADER {0:40}{1:%d-%b-%y} {2:4}' 1234 self.writeline(header.format('.', datetime.datetime.now(), master.accession.upper())) 1235 1236 molecules = { } 1237 1238 for chain_id in master.chains: 1239 chain = master.chains[chain_id] 1240 if chain.molecule_id not in molecules: 1241 molecules[chain.molecule_id] = [ ] 1242 molecules[chain.molecule_id].append(chain_id) 1243 1244 k = 0 1245 for mol_id in sorted(molecules): 1246 1247 chains = molecules[mol_id] 1248 first_chain = master.chains[ chains[0] ] 1249 1250 self.writeline('COMPND {0:3} MOL_ID: {1};'.format(k + 1, isnull(mol_id, '0'))) 1251 self.writeline('COMPND {0:3} MOLECULE: {1};'.format(k + 2, isnull(first_chain.name, ''))) 1252 self.writeline('COMPND {0:3} CHAIN: {1};'.format(k + 3, ', '.join(chains))) 1253 k += 3 1254 1255 for chain_id in master.chains: 1256 1257 chain = master.chains[chain_id] 1258 res = [ r._pdb_name for r in chain.residues ] 1259 1260 rn = 0 1261 for j in range(0, chain.length, 13): 1262 rn += 1 1263 residues = [ '{0:>3}'.format(r) for r in res[j : j + 13] ] 1264 self.writeline('SEQRES {0:>3} {1} {2:>4} {3}'.format( 1265 rn, chain.id, chain.length, ' '.join(residues) ))
1266
1267 - def add_structure(self, structure):
1268 """ 1269 Append a new model to the file 1270 1271 @type structure: L{Structure} 1272 """ 1273 1274 isnull = self.isnull 1275 1276 for chain_id in structure.chains: 1277 1278 chain = structure.chains[chain_id] 1279 for residue in chain.residues: 1280 1281 atoms = [ ] 1282 for an in residue.atoms: 1283 atom = residue.atoms[an] 1284 if isinstance(atom, csb.bio.structure.DisorderedAtom): 1285 for dis_atom in atom: atoms.append(dis_atom) 1286 else: 1287 atoms.append(atom) 1288 atoms.sort() 1289 1290 for atom in atoms: 1291 1292 alt = atom.alternate 1293 if alt is True: 1294 alt = 'A' 1295 elif alt is False: 1296 alt = ' ' 1297 1298 if atom.element: 1299 element = repr(atom.element) 1300 else: 1301 element = ' ' 1302 self.writeline('ATOM {0:>5} {1:>4}{2}{3:>3} {4}{5:>4}{6} {7:>8.3f}{8:>8.3f}{9:>8.3f}{10:>6.2f}{11:>6.2f}{12:>12}{13:2}'.format( 1303 atom.serial_number, atom._full_name, isnull(alt, ' '), 1304 residue._pdb_name, chain.id, 1305 isnull(residue.sequence_number, residue.rank), isnull(residue.insertion_code, ' '), 1306 atom.vector[0], atom.vector[1], atom.vector[2], isnull(atom.occupancy, 0.0), isnull(atom.bfactor, 0.0), 1307 element, isnull(atom.charge, ' ') )) 1308 1309 self.writeline('TER')
1310
1311 - def finalize(self):
1312 """ 1313 Add the END marker 1314 """ 1315 self.writeline('END') 1316 self._out.flush()
1317
1318 -class PDBEnsembleFileBuilder(PDBFileBuilder):
1319 """ 1320 Supports serialization of NMR ensembles. 1321 1322 Functions as a simple decorator, which wraps C{add_structure} with 1323 MODEL/ENDMDL records. 1324 """ 1325
1326 - def add_structure(self, structure):
1327 1328 model_id = self.isnull(structure.model_id, 1) 1329 1330 self.writeline('MODEL {0:>4}'.format(model_id)) 1331 super(PDBEnsembleFileBuilder, self).add_structure(structure) 1332 self.writeline('ENDMDL')
1333
1334 1335 -class StructureProvider(object):
1336 """ 1337 Base class for all PDB data source providers. 1338 1339 Concrete classes need to implement the C{find} method, which abstracts the 1340 retrieval of a PDB structure file by a structure identifier. This is a hook 1341 method called internally by C{get}, but subclasses can safely override both 1342 C{find} and {get} to in order to achieve completely custom behavior. 1343 """ 1344 1345 __metaclass__ = ABCMeta 1346
1347 - def __getitem__(self, id):
1348 return self.get(id)
1349 1350 @abstractmethod
1351 - def find(self, id):
1352 """ 1353 Attempt to discover a PDB file, given a specific PDB C{id}. 1354 1355 @param id: structure identifier (e.g. 1x80) 1356 @type id: str 1357 @return: path and file name on success, None otherwise 1358 @rtype: str or None 1359 """ 1360 pass
1361
1362 - def get(self, id, model=None):
1363 """ 1364 Discover, parse and return the PDB structure, corresponding to the 1365 specified C{id}. 1366 1367 @param id: structure identifier (e.g. 1x80) 1368 @type id: str 1369 @param model: optional model identifier 1370 @type model: str 1371 1372 @rtype: L{csb.bio.Structure} 1373 1374 @raise StructureNotFoundError: when C{id} could not be found 1375 """ 1376 pdb = self.find(id) 1377 1378 if pdb is None: 1379 raise StructureNotFoundError(id) 1380 else: 1381 return StructureParser(pdb).parse_structure(model=model)
1382
1383 -class FileSystemStructureProvider(StructureProvider):
1384 """ 1385 Simple file system based PDB data source. Scans a list of local directories 1386 using pre-defined file name templates. 1387 1388 @param paths: a list of paths 1389 @type paths: iterable or str 1390 """ 1391
1392 - def __init__(self, paths=None):
1393 1394 self._templates = ['pdb{id}.ent', 'pdb{id}.pdb', '{id}.pdb', '{id}.ent'] 1395 self._paths = csb.core.OrderedDict() 1396 1397 if paths is not None: 1398 if isinstance(paths, csb.core.string): 1399 paths = [paths] 1400 1401 for path in paths: 1402 self.add(path)
1403 1404 @property
1405 - def paths(self):
1406 """ 1407 Current search paths 1408 @rtype: tuple 1409 """ 1410 return tuple(self._paths)
1411 1412 @property
1413 - def templates(self):
1414 """ 1415 Current file name match templates 1416 @rtype: tuple 1417 """ 1418 return tuple(self._templates)
1419
1420 - def add(self, path):
1421 """ 1422 Register a new local C{path}. 1423 1424 @param path: directory name 1425 @type path: str 1426 1427 @raise IOError: if C{path} is not a valid directory 1428 """ 1429 if os.path.isdir(path): 1430 self._paths[path] = path 1431 else: 1432 raise IOError(path)
1433
1434 - def add_template(self, template):
1435 """ 1436 Register a custom file name name C{template}. The template must contain 1437 an E{lb}idE{rb} macro, e.g. pdbE{lb}idE{rb}.ent 1438 1439 @param template: pattern 1440 @type template: str 1441 """ 1442 1443 if '{id}' not in template: 1444 raise ValueError('Template does not contain an "{id}" macro') 1445 1446 if template not in self._templates: 1447 self._templates.append(template)
1448
1449 - def remove(self, path):
1450 """ 1451 Unregister an existing local C{path}. 1452 1453 @param path: directory name 1454 @type path: str 1455 1456 @raise ValueError: if C{path} had not been registered 1457 """ 1458 if path not in self._paths: 1459 raise ValueError('path not found: {0}'.format(path)) 1460 1461 del self._paths[path]
1462
1463 - def find(self, id):
1464 1465 for path in self._paths: 1466 for token in self.templates: 1467 fn = os.path.join(path, token.format(id=id)) 1468 if os.path.exists(fn): 1469 return fn 1470 1471 return None
1472
1473 -class RemoteStructureProvider(StructureProvider):
1474 """ 1475 Retrieves PDB structures from a specified remote URL. 1476 The URL requested from remote server takes the form: <prefix>/<ID><suffix> 1477 1478 @param prefix: URL prefix, including protocol 1479 @type prefix: str 1480 @param suffix: optional URL suffix (.ent by default) 1481 @type suffix: str 1482 """ 1483
1484 - def __init__(self, prefix='http://www.rcsb.org/pdb/files/pdb', suffix='.ent'):
1485 1486 self._prefix = None 1487 self._suffix = None 1488 1489 self.prefix = prefix 1490 self.suffix = suffix
1491 1492 @property
1493 - def prefix(self):
1494 """ 1495 Current URL prefix 1496 @rtype: str 1497 """ 1498 return self._prefix
1499 @prefix.setter
1500 - def prefix(self, value):
1501 self._prefix = value
1502 1503 @property
1504 - def suffix(self):
1505 """ 1506 Current URL suffix 1507 @rtype: str 1508 """ 1509 return self._suffix
1510 @suffix.setter
1511 - def suffix(self, value):
1512 self._suffix = value
1513
1514 - def _find(self, id):
1515 1516 try: 1517 return csb.io.urllib.urlopen(self.prefix + id + self.suffix) 1518 except: 1519 raise StructureNotFoundError(id)
1520
1521 - def find(self, id):
1522 1523 stream = self._find(id) 1524 1525 try: 1526 tmp = csb.io.TempFile(dispose=False) 1527 tmp.write(stream.read().decode('utf-8')) 1528 tmp.flush() 1529 return tmp.name 1530 1531 except StructureNotFoundError: 1532 return None 1533 finally: 1534 stream.close()
1535
1536 - def get(self, id, model=None):
1537 1538 stream = self._find(id) 1539 1540 try: 1541 with csb.io.TempFile() as tmp: 1542 tmp.write(stream.read().decode('utf-8')) 1543 tmp.flush() 1544 return StructureParser(tmp.name).parse_structure(model=model) 1545 finally: 1546 stream.close()
1547
1548 -class CustomStructureProvider(StructureProvider):
1549 """ 1550 A custom PDB data source. Functions as a user-defined map of structure 1551 identifiers and their corresponding local file names. 1552 1553 @param files: initialization dictionary of id:file pairs 1554 @type files: dict-like 1555 """ 1556
1557 - def __init__(self, files={}):
1558 1559 self._files = {} 1560 for id in files: 1561 self.add(id, files[id])
1562 1563 @property
1564 - def paths(self):
1565 """ 1566 List of currently registered file names 1567 @rtype: tuple 1568 """ 1569 return tuple(self._files.values())
1570 1571 @property
1572 - def identifiers(self):
1573 """ 1574 List of currently registered structure identifiers 1575 @rtype: tuple 1576 """ 1577 return tuple(self._files)
1578
1579 - def add(self, id, path):
1580 """ 1581 Register a new local C{id}:C{path} pair. 1582 1583 @param id: structure identifier 1584 @type id: str 1585 @param path: path and file name 1586 @type path: str 1587 1588 @raise IOError: if C{path} is not a valid file name 1589 """ 1590 if os.path.isfile(path): 1591 self._files[id] = path 1592 else: 1593 raise IOError(path)
1594
1595 - def remove(self, id):
1596 """ 1597 Unregister an existing structure C{id}. 1598 1599 @param id: structure identifier 1600 @type id: str 1601 1602 @raise ValueError: if C{id} had not been registered 1603 """ 1604 if id not in self._files: 1605 raise ValueError(id) 1606 else: 1607 del self._files[id]
1608
1609 - def find(self, id):
1610 1611 if id in self._files: 1612 return self._files[id] 1613 else: 1614 return None
1615
1616 -def get(accession, model=None, prefix='http://www.rcsb.org/pdb/files/pdb'):
1617 """ 1618 Download and parse a PDB entry. 1619 1620 @param accession: accession number of the entry 1621 @type accession: str 1622 @param model: model identifier 1623 @type model: str 1624 @param prefix: download URL prefix 1625 @type prefix: str 1626 1627 @return: object representation of the selected model 1628 @rtype: L{Structure} 1629 """ 1630 return RemoteStructureProvider(prefix).get(accession, model=model)
1631
1632 -def find(id, paths):
1633 """ 1634 Try to discover a PDB file for PDB C{id} in C{paths}. 1635 1636 @param id: PDB ID of the entry 1637 @type id: str 1638 @param paths: a list of directories to scan 1639 @type paths: list of str 1640 1641 @return: path and file name on success, None otherwise 1642 @rtype: str 1643 """ 1644 return FileSystemStructureProvider(paths).find(id)
1645
1646 1647 -class AsyncParseResult(object):
1648
1649 - def __init__(self, result, exception):
1650 1651 self.result = result 1652 self.exception = exception
1653
1654 - def __repr__(self):
1655 return '<AsyncParseResult: result={0.result}, error={0.exception.__class__.__name__}>'.format(self)
1656
1657 1658 -def _parse_async(parser, file, model):
1659 p = parser(file) 1660 return p.parse_structure(model)
1661
1662 1663 -class AsyncStructureParser(object):
1664 """ 1665 Wraps StructureParser in an asynchronous call. Since a new process is 1666 started by Python internally (as opposed to only starting a new thread), 1667 this makes the parser slower, but provides a way to set a parse timeout 1668 limit. 1669 1670 If initialized with more than one worker, supports parallel parsing 1671 through the C{self.parse_async} method. 1672 1673 @param workers: number of worker threads (1 by default) 1674 @type workers: int 1675 """ 1676
1677 - def __init__(self, workers=1):
1678 1679 self._pool = None 1680 self._workers = 1 1681 1682 if int(workers) > 0: 1683 self._workers = int(workers) 1684 else: 1685 raise ValueError(workers) 1686 1687 self._recycle()
1688
1689 - def _recycle(self):
1690 1691 if self._pool: 1692 self._pool.terminate() 1693 1694 self._pool = multiprocessing.Pool(processes=self._workers)
1695
1696 - def parse_structure(self, structure_file, timeout, model=None, 1697 parser=RegularStructureParser):
1698 """ 1699 Call StructureParser.parse_structure() in a separate process and return 1700 the output. Raise TimeoutError if the parser does not respond within 1701 C{timeout} seconds. 1702 1703 @param structure_file: structure file to parse 1704 @type structure_file: str 1705 @param timeout: raise multiprocessing.TimeoutError if C{timeout} seconds 1706 elapse before the parser completes its job 1707 @type timeout: int 1708 @param parser: any implementing L{AbstractStructureParser} class 1709 @type parser: type 1710 1711 @return: parsed structure 1712 @rtype: L{csb.structure.Structure} 1713 """ 1714 1715 r = self.parse_async([structure_file], timeout, model, parser) 1716 if len(r) > 0: 1717 if r[0].exception is not None: 1718 raise r[0].exception 1719 else: 1720 return r[0].result 1721 return None
1722
1723 - def parse_async(self, structure_files, timeout, model=None, 1724 parser=RegularStructureParser):
1725 """ 1726 Call C{self.parse_structure} for a list of structure files 1727 simultaneously. The actual degree of parallelism will depend on the 1728 number of workers specified while constructing the parser object. 1729 1730 @note: Don't be tempted to pass a large list of structures to this 1731 method. Every time a C{TimeoutError} is encountered, the 1732 corresponding worker process in the pool will hang until the 1733 process terminates on its own. During that time, this worker is 1734 unusable. If a sufficiently high number of timeouts occur, the 1735 whole pool of workers will be unsable. At the end of the method 1736 however a pool cleanup is performed and any unusable workers 1737 are 'reactivated'. However, that only happens at B{the end} of 1738 C{parse_async}. 1739 1740 @param structure_files: a list of structure files 1741 @type structure_files: tuple of str 1742 @param timeout: raise multiprocessing.TimeoutError if C{timeout} seconds 1743 elapse before the parser completes its job 1744 @type timeout: int 1745 @param parser: any implementing L{AbstractStructureParser} class 1746 @type parser: type 1747 1748 @return: a list of L{AsyncParseResult} objects 1749 @rtype: list 1750 """ 1751 1752 pool = self._pool 1753 workers = [] 1754 results = [] 1755 1756 for file in list(structure_files): 1757 result = pool.apply_async(_parse_async, [parser, file, model]) 1758 workers.append(result) 1759 1760 hanging = False 1761 for w in workers: 1762 result = AsyncParseResult(None, None) 1763 try: 1764 result.result = w.get(timeout=timeout) 1765 except KeyboardInterrupt as ki: 1766 pool.terminate() 1767 raise ki 1768 except Exception as ex: 1769 result.exception = ex 1770 if isinstance(ex, multiprocessing.TimeoutError): 1771 hanging = True 1772 results.append(result) 1773 1774 if hanging: 1775 self._recycle() 1776 1777 return results
1778