Package csb :: Package bio :: Package io :: Module wwpdb
[frames] | no frames]

Source Code for Module csb.bio.io.wwpdb

   1  """ 
   2  PDB structure parsers, format builders and database providers. 
   3   
   4  The most basic usage is: 
   5   
   6      >>> parser = StructureParser('structure.pdb') 
   7      >>> parser.parse_structure() 
   8      <Structure>     # a Structure object (model) 
   9   
  10  or if this is an NMR ensemble: 
  11       
  12      >>> parser.parse_models() 
  13      <Ensemble>      # an Ensemble object (collection of alternative Structure-s) 
  14       
  15  This module introduces a family of PDB file parsers. The common interface of all 
  16  parsers is defined in L{AbstractStructureParser}. This class has several 
  17  implementations: 
  18   
  19      - L{RegularStructureParser} - handles normal PDB files with SEQRES fields 
  20      - L{LegacyStructureParser} - reads structures from legacy or malformed PDB files, 
  21        which are lacking SEQRES records (initializes all residues from the ATOMs instead) 
  22      - L{PDBHeaderParser} - reads only the headers of the PDB files and produces structures 
  23        without coordinates. Useful for reading metadata (e.g. accession numbers or just 
  24        plain SEQRES sequences) with minimum overhead 
  25         
  26  Unless you have a special reason, you should use the L{StructureParser} factory, 
  27  which returns a proper L{AbstractStructureParser} implementation, depending on the 
  28  input PDB file. If the input file looks like a regular PDB file, the factory 
  29  returns a L{RegularStructureParser}, otherwise it instantiates L{RegularStructureParser}. 
  30  L{StructureParser} is in fact an alias for L{AbstractStructureParser.create_parser}. 
  31   
  32  Another important abstraction in this module is L{StructureProvider}. It has several 
  33  implementations which can be used to retrieve PDB L{Structure}s from various sources: 
  34  file system directories, remote URLs, etc. You can easily create your own provider as 
  35  well. See L{StructureProvider} for details. 
  36   
  37  Finally, this module gives you some L{FileBuilder}s, used for text serialization  
  38  of L{Structure}s and L{Ensemble}s: 
  39   
  40      >>> builder = PDBFileBuilder(stream) 
  41      >>> builder.add_header(structure) 
  42      >>> builder.add_structure(structure) 
  43   
  44  where stream is any Python stream, e.g. an open file or sys.stdout. 
  45   
  46  See L{Ensemble} and L{Structure} from L{csb.bio.structure} for details on these 
  47  objects. 
  48  """ 
  49   
  50  import re 
  51  import os 
  52  import numpy 
  53  import datetime 
  54  import multiprocessing 
  55   
  56  import csb.bio.structure 
  57  import csb.core 
  58  import csb.io 
  59   
  60  from abc import ABCMeta, abstractmethod 
  61  from csb.bio.sequence import SequenceTypes, SequenceAlphabets 
  62  from csb.bio.structure import ChemElements, SecStructures 
  63   
  64   
  65  PDB_AMINOACIDS = { 
  66      'PAQ': 'TYR', 'AGM': 'ARG', 'ILE': 'ILE', 'PR3': 'CYS', 'GLN': 'GLN', 
  67      'DVA': 'VAL', 'CCS': 'CYS', 'ACL': 'ARG', 'GLX': 'GLX', 'GLY': 'GLY', 
  68      'GLZ': 'GLY', 'DTH': 'THR', 'OAS': 'SER', 'C6C': 'CYS', 'NEM': 'HIS', 
  69      'DLY': 'LYS', 'MIS': 'SER', 'SMC': 'CYS', 'GLU': 'GLU', 'NEP': 'HIS', 
  70      'BCS': 'CYS', 'ASQ': 'ASP', 'ASP': 'ASP', 'SCY': 'CYS', 'SER': 'SER', 
  71      'LYS': 'LYS', 'SAC': 'SER', 'PRO': 'PRO', 'ASX': 'ASX', 'DGN': 'GLN', 
  72      'DGL': 'GLU', 'MHS': 'HIS', 'ASB': 'ASP', 'ASA': 'ASP', 'NLE': 'LEU', 
  73      'DCY': 'CYS', 'ASK': 'ASP', 'GGL': 'GLU', 'STY': 'TYR', 'SEL': 'SER', 
  74      'CGU': 'GLU', 'ASN': 'ASN', 'ASL': 'ASP', 'LTR': 'TRP', 'DAR': 'ARG', 
  75      'VAL': 'VAL', 'CHG': 'ALA', 'TPO': 'THR', 'CLE': 'LEU', 'GMA': 'GLU', 
  76      'HAC': 'ALA', 'AYA': 'ALA', 'THR': 'THR', 'TIH': 'ALA', 'SVA': 'SER', 
  77      'MVA': 'VAL', 'SAR': 'GLY', 'LYZ': 'LYS', 'BNN': 'ALA', '5HP': 'GLU', 
  78      'IIL': 'ILE', 'SHR': 'LYS', 'HAR': 'ARG', 'FME': 'MET', 'ALO': 'THR', 
  79      'PHI': 'PHE', 'ALM': 'ALA', 'PHL': 'PHE', 'MEN': 'ASN', 'TPQ': 'ALA', 
  80      'GSC': 'GLY', 'PHE': 'PHE', 'ALA': 'ALA', 'MAA': 'ALA', 'MET': 'MET', 
  81      'UNK': 'UNK', 'LEU': 'LEU', 'ALY': 'LYS', 'SET': 'SER', 'GL3': 'GLY', 
  82      'TRG': 'LYS', 'CXM': 'MET', 'TYR': 'TYR', 'SCS': 'CYS', 'DIL': 'ILE', 
  83      'TYQ': 'TYR', '3AH': 'HIS', 'DPR': 'PRO', 'PRR': 'ALA', 'CME': 'CYS', 
  84      'IYR': 'TYR', 'CY1': 'CYS', 'TYY': 'TYR', 'HYP': 'PRO', 'DTY': 'TYR', 
  85      '2AS': 'ASP', 'DTR': 'TRP', 'FLA': 'ALA', 'DPN': 'PHE', 'DIV': 'VAL', 
  86      'PCA': 'GLU', 'MSE': 'MET', 'MSA': 'GLY', 'AIB': 'ALA', 'CYS': 'CYS', 
  87      'NLP': 'LEU', 'CYQ': 'CYS', 'HIS': 'HIS', 'DLE': 'LEU', 'CEA': 'CYS', 
  88      'DAL': 'ALA', 'LLP': 'LYS', 'DAH': 'PHE', 'HMR': 'ARG', 'TRO': 'TRP', 
  89      'HIC': 'HIS', 'CYG': 'CYS', 'BMT': 'THR', 'DAS': 'ASP', 'TYB': 'TYR', 
  90      'BUC': 'CYS', 'PEC': 'CYS', 'BUG': 'LEU', 'CYM': 'CYS', 'NLN': 'LEU', 
  91      'CY3': 'CYS', 'HIP': 'HIS', 'CSO': 'CYS', 'TPL': 'TRP', 'LYM': 'LYS', 
  92      'DHI': 'HIS', 'MLE': 'LEU', 'CSD': 'ALA', 'HPQ': 'PHE', 'MPQ': 'GLY', 
  93      'LLY': 'LYS', 'DHA': 'ALA', 'DSN': 'SER', 'SOC': 'CYS', 'CSX': 'CYS', 
  94      'OMT': 'MET', 'DSP': 'ASP', 'PTR': 'TYR', 'TRP': 'TRP', 'CSW': 'CYS', 
  95      'EFC': 'CYS', 'CSP': 'CYS', 'CSS': 'CYS', 'SCH': 'CYS', 'OCS': 'CYS', 
  96      'NMC': 'GLY', 'SEP': 'SER', 'BHD': 'ASP', 'KCX': 'LYS', 'SHC': 'CYS', 
  97      'C5C': 'CYS', 'HTR': 'TRP', 'ARG': 'ARG', 'TYS': 'TYR', 'ARM': 'ARG', 
  98      'DNP': 'ALA' 
  99      } 
 100  """ 
 101  Dictionary of non-standard amino acids, which could be found in PDB. 
 102  """ 
 103   
 104   
 105  PDB_NUCLEOTIDES = { 
 106      'DA': 'Adenine', 'DG': 'Guanine', 'DC': 'Cytosine', 'DT': 'Thymine', 
 107       'A': 'Adenine', 'G': 'Guanine', 'C': 'Cytosine', 'T': 'Thymine', 
 108       'U': 'Uracil', 'DOC': 'Cytosine', 'R': 'Purine', 'Y': 'Pyrimidine', 
 109       'K': 'Ketone', '  M': 'Amino', 'S': 'Strong', 'W': 'Weak', 
 110       'B': 'NotA', 'D'  : 'NotC', 'H': 'NotG', 'V': 'NotT', 
 111       'N': 'Any', 'X'  : 'Masked' 
 112      } 
 113  """ 
 114  Dictionary of non-standard nucleotides, which could be found in PDB. 
 115  """ 
116 117 -class PDBParseError(ValueError):
118 pass
119
120 -class HeaderFormatError(PDBParseError):
121 pass
122
123 -class SecStructureFormatError(PDBParseError):
124 pass
125
126 -class StructureFormatError(PDBParseError):
127 pass
128
129 -class UnknownPDBResidueError(PDBParseError):
130 pass
131
132 -class StructureNotFoundError(KeyError):
133 pass
134
135 -class InvalidEntryIDError(StructureFormatError):
136 pass
137
138 139 -class EntryID(object):
140 """ 141 Represents a PDB Chain identifier. Implementing classes must define 142 how the original ID is split into accession number and chain ID. 143 144 @param id: identifier 145 @type id: str 146 """ 147 __metaclass__ = ABCMeta 148
149 - def __init__(self, id):
150 151 self._accession = '' 152 self._chain = '' 153 154 id = id.strip() 155 self._accession, self._chain = self.parse(id)
156 157 @staticmethod
158 - def create(id):
159 """ 160 Guess the format of C{id} and parse it. 161 162 @return: a new PDB ID of the appropriate type 163 @rtype: L{EntryID} 164 """ 165 166 if len(id) in (4, 5): 167 return StandardID(id) 168 elif len(id) == 6 and id[4] == '_': 169 return SeqResID(id) 170 else: 171 return DegenerateID(id)
172 173 @abstractmethod
174 - def parse(self, id):
175 """ 176 Split C{id} into accession number and chain ID. 177 178 @param id: PDB identifier 179 @type id: str 180 @return: (accession, chain) 181 @rtype: tuple of str 182 183 @raise InvalidEntryIDError: when C{id} is in an inappropriate format 184 """ 185 pass
186
187 - def format(self):
188 """ 189 @return: the identifier in its original format 190 @rtype: str 191 """ 192 return self.entry_id
193 194 @property
195 - def accession(self):
196 """ 197 Accession number part of the Entry ID 198 @rtype: str 199 """ 200 return self._accession
201 202 @property
203 - def chain(self):
204 """ 205 Chain ID part of the Entry ID 206 @rtype: str 207 """ 208 return self._chain
209 210 @property
211 - def entry_id(self):
212 """ 213 Accession number + Chain ID 214 @rtype: str 215 """ 216 return "{0.accession}{0.chain}".format(self)
217
218 - def __str__(self):
219 return self.entry_id
220
221 -class StandardID(EntryID):
222 """ 223 Standard PDB ID in the following form: xxxxY, where xxxx is the accession 224 number (lower case) and Y is an optional chain identifier. 225 """
226 - def parse(self, id):
227 228 if len(id) not in (4, 5): 229 raise InvalidEntryIDError(id) 230 231 return (id[:4].lower(), id[4:])
232
233 -class DegenerateID(EntryID):
234 """ 235 Looks like a L{StandardID}, except that the accession number may have 236 arbitrary length. 237 """
238 - def parse(self, id):
239 240 if len(id) < 2: 241 raise InvalidEntryIDError(id) 242 243 return (id[:-1].lower(), id[-1])
244
245 -class SeqResID(EntryID):
246 """ 247 Same as a L{StandardID}, but contains an additional underscore between 248 te accession number and the chain identifier. 249 """
250 - def parse(self, id):
251 252 if not (len(id) == 6 and id[4] == '_'): 253 raise InvalidEntryIDError(id) 254 255 return (id[:4].lower(), id[5:])
256
257 - def format(self):
258 return "{0.accession}_{0.chain}".format(self)
259
260 261 -class AbstractStructureParser(object):
262 """ 263 A base PDB structure format-aware parser. Subclasses must implement the 264 internal abstract method C{_parse_header} in order to complete the 265 implementation. 266 267 @param structure_file: the input PD file to parse 268 @type structure_file: str 269 @param check_ss: if True, secondary structure errors in the file will cause 270 L{SecStructureFormatError} exceptions 271 @type check_ss: bool 272 273 @raise IOError: when the input file cannot be found 274 """ 275 276 __metaclass__ = ABCMeta 277 278 @staticmethod
279 - def create_parser(structure_file, check_ss=False):
280 """ 281 A StructureParser factory, which instantiates and returns the proper parser 282 object based on the contents of the PDB file. 283 284 If the file contains a SEQRES section, L{RegularStructureParser} is returned, 285 otherwise L{LegacyStructureParser} is instantiated. In the latter case 286 LegacyStructureParser will read the sequence data directly from the ATOMs. 287 288 @param structure_file: the PDB file to parse 289 @type structure_file: str 290 291 @rtype: L{AbstractStructureParser} 292 """ 293 has_seqres = False 294 295 for line in open(structure_file): 296 if line.startswith('SEQRES'): 297 has_seqres = True 298 break 299 300 if has_seqres: 301 return RegularStructureParser(structure_file) 302 else: 303 return LegacyStructureParser(structure_file)
304
305 - def __init__(self, structure_file, check_ss=False):
306 307 self._file = None 308 self._stream = None 309 self._check_ss = bool(check_ss) 310 311 self.filename = structure_file
312
313 - def __del__(self):
314 try: 315 self._stream.close() 316 except: 317 pass
318 319 @property
320 - def filename(self):
321 """ 322 Current input PDB file name 323 @rtype: str 324 """ 325 return self._file
326 @filename.setter
327 - def filename(self, name):
328 try: 329 stream = open(name) 330 except IOError: 331 raise IOError('File not found: {0}'.format(name)) 332 333 if self._stream: 334 try: 335 self._stream.close() 336 except: 337 pass 338 self._stream = stream 339 self._file = name
340
341 - def models(self):
342 """ 343 Find all available model identifiers in the structure. 344 345 @return: a list of model IDs 346 @rtype: list 347 """ 348 models = [] 349 check = set() 350 351 with open(self._file, 'r') as f: 352 for line in f: 353 if line.startswith('MODEL'): 354 model_id = int(line[10:14]) 355 if model_id in check: 356 raise StructureFormatError('Duplicate model identifier: {0}'.format(model_id)) 357 models.append(model_id) 358 check.add(model_id) 359 360 if len(models) > 0: 361 if not(min(check) == 1 and max(check) == len(models)): 362 raise StructureFormatError('Non-consecutive model identifier(s) encountered') 363 return models 364 else: 365 return [None]
366
367 - def guess_sequence_type(self, residue_name):
368 """ 369 Try to guess what is the sequence type of a PDB C{residue_name}. 370 371 @param residue_name: a PDB-conforming name of a residue 372 @type residue_name: str 373 374 @return: a L{SequenceTypes} enum member 375 @rtype: L{csb.core.EnumItem} 376 377 @raise UnknownPDBResidueError: when there is no such PDB residue name 378 in the catalog tables 379 """ 380 if residue_name in PDB_AMINOACIDS: 381 return SequenceTypes.Protein 382 elif residue_name in PDB_NUCLEOTIDES: 383 return SequenceTypes.NucleicAcid 384 else: 385 raise UnknownPDBResidueError(residue_name)
386
387 - def parse_residue(self, residue_name, as_type=None):
388 """ 389 Try to parse a PDB C{residue_name} and return its closest 'normal' 390 string representation. If a sequence type (C{as_type}) is defined, 391 guess the alphabet based on that information, otherwise try first to 392 parse it as a protein residue. 393 394 @param residue_name: a PDB-conforming name of a residue 395 @type residue_name: str 396 @param as_type: suggest a sequence type (L{SequenceTypes} member) 397 @type L{scb.core.EnumItem} 398 399 @return: a normalized residue name 400 @rtype: str 401 402 @raise UnknownPDBResidueError: when there is no such PDB residue name 403 in the catalog table(s) 404 """ 405 if as_type is None: 406 as_type = self.guess_sequence_type(residue_name) 407 408 try: 409 if as_type == SequenceTypes.Protein: 410 return PDB_AMINOACIDS[residue_name] 411 elif as_type == SequenceTypes.NucleicAcid: 412 return PDB_NUCLEOTIDES[residue_name] 413 else: 414 raise UnknownPDBResidueError(residue_name) 415 except KeyError: 416 raise UnknownPDBResidueError(residue_name)
417
418 - def parse_residue_safe(self, residue_name, as_type):
419 """ 420 Same as C{parse_residue}, but returns UNK/Any instead of raising 421 UnknownPDBResidueError. 422 423 @param residue_name: a PDB-conforming name of a residue 424 @type residue_name: str 425 @param as_type: suggest a sequence type (L{SequenceTypes} member) 426 @type L{scb.core.EnumItem} 427 428 @return: a normalized residue name 429 @rtype: str 430 """ 431 try: 432 return self.parse_residue(residue_name, as_type) 433 except UnknownPDBResidueError: 434 if as_type == SequenceTypes.Protein: 435 return repr(SequenceAlphabets.Protein.UNK) 436 elif as_type == SequenceTypes.NucleicAcid: 437 return repr(SequenceAlphabets.Nucleic.Any) 438 else: 439 return repr(SequenceAlphabets.Unknown.UNK)
440
441 - def parse(self, filename=None, model=None):
442 443 if filename: 444 self.filename = filename 445 return self.parse_structure(model)
446
447 - def parse_structure(self, model=None):
448 """ 449 Parse and return the L{Structure} with the specified model identifier. 450 If no explicit model is specified, parse the first model in the 451 structure. 452 453 @param model: parse exactly the model with this ID 454 @type model: str 455 456 @return: object representation of the selected model 457 @rtype: L{Structure} 458 459 @raise ValueError: When an invalid model ID is specified 460 @raise PDBParseError: When the input PDB file suffers from unrecoverable 461 corruption. More specialized exceptions will be 462 raised depending on the context (see L{PDBParseError}'s 463 subclasses). 464 """ 465 if model is not None: 466 model = int(model) 467 468 structure = self._parse_header(model) 469 470 self._parse_atoms(structure, model) 471 self._parse_ss(structure) 472 473 return structure
474
475 - def parse_models(self, models=()):
476 """ 477 Parse the specified models in the file and build an L{Ensemble}. 478 479 @param models: an iterable object providing model identifiers. 480 If not specified, all models will be parsed. 481 @type models: tuple 482 483 @return: an ensemble with all parsed models 484 @rtype: L{Ensemble} 485 """ 486 487 if not models: 488 models = self.models() 489 else: 490 models = map(int, models) 491 492 ensemble = csb.bio.structure.Ensemble() 493 for model_id in models: 494 model = self.parse_structure(model_id) 495 ensemble.models.append(model) 496 497 return ensemble
498
499 - def parse_biomolecule(self, number=1, single=False):
500 """ 501 Parse and return the L{Structure} of the biological unit (quaternary 502 structure) as annotated by the REMARK 350 BIOMOLECULE record. 503 504 @param number: biomolecule number 505 @type number: int 506 507 @param single: if True, assign new single-letter chain 508 identifiers. If False, assign multi-letter chain identifiers whith a 509 number appended to the original identifier, like "A1", "A2", ... 510 @type single: bool 511 512 @return: structure of biological unit 513 @rtype: L{Structure} 514 """ 515 remarks = self._parse_remarks() 516 if 350 not in remarks: 517 raise PDBParseError('There is no REMARK 350') 518 519 current = 1 520 biomt = {current: {}} 521 chains = tuple() 522 523 def split(line): 524 return [c.strip() for c in line.split(',') if c.strip() != '']
525 526 for line in remarks[350]: 527 if line.startswith('BIOMOLECULE:'): 528 current = int(line[12:]) 529 biomt[current] = {} 530 elif line.startswith('APPLY THE FOLLOWING TO CHAINS:'): 531 chains = tuple(split(line[30:])) 532 elif line.startswith(' AND CHAINS:'): 533 chains += tuple(split(line[30:])) 534 elif line.startswith(' BIOMT'): 535 num = int(line[8:12]) 536 vec = line[12:].split() 537 vec = list(map(float, vec)) 538 biomt[current].setdefault(chains, dict()).setdefault(num, []).extend(vec) 539 540 if number not in biomt or len(biomt[number]) == 0: 541 raise KeyError('no BIOMOLECULE number {0}'.format(number)) 542 543 asu = self.parse_structure() 544 structure = csb.bio.structure.Structure('{0}_{1}'.format(asu.accession, number)) 545 546 newchainiditer = iter('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789') 547 548 for chains, matrices in biomt[number].items(): 549 for num in matrices: 550 mat = numpy.array(matrices[num][0:12]).reshape((3,4)) 551 R, t = mat[:3,:3], mat[:3,3] 552 553 for chain in chains: 554 if chain not in asu: 555 raise PDBParseError('chain {0} missing'.format(chain)) 556 copy = asu[chain].clone() 557 copy.transform(R, t) 558 if single: 559 if len(structure.chains) == 62: 560 raise ValueError('too many chains for single=True') 561 copy.id = next(newchainiditer) 562 else: 563 copy.id = '{0}{1}'.format(chain, num) 564 structure.chains.append(copy) 565 566 return structure
567 568 @abstractmethod
569 - def _parse_header(self, model):
570 """ 571 An abstract method, which subclasses must use to customize the way the 572 PDB header (or the absence of a such) is handled. The implementation 573 must rely on reading character data from the current internal 574 self._stream and must return a new L{csb.bio.structure.Structure} 575 instance with properly initialized header data: accession, model, 576 molecule identifiers, chains and residues. This structure object is 577 then internally passed to the C{_parse_atoms} hook, responsible for 578 attachment of the atoms to the residues in the structure. 579 """ 580 pass
581
582 - def _scroll_model(self, model, stream):
583 """ 584 Scroll the C{stream} to the specified C{model}. 585 """ 586 587 while True: 588 try: 589 line = next(stream) 590 except StopIteration: 591 raise ValueError('No such model {0} in the structure.'.format(model)) 592 593 if line.startswith('MODEL'): 594 model_id = int(line[10:14]) 595 if model == model_id: 596 return model_id
597
598 - def _parse_atoms(self, structure, model):
599 """ 600 Parse the ATOMs from the specified C{model} and attach them to the 601 C{structure}. 602 """ 603 604 structure.model_id = None 605 606 atoms = dict( (chain, []) for chain in structure.chains ) 607 chains = set() 608 total_chains = len([c for c in structure.items if c.length > 0]) 609 het_residues = dict( (chain, set()) for chain in structure.chains ) 610 in_ligands = False 611 in_atom = False 612 613 self._stream.seek(0) 614 while True: 615 616 try: 617 line = next(self._stream) 618 except StopIteration: 619 break 620 621 if line.startswith('HET '): 622 het_residue, het_chain, het_residue_id = line[7:10].strip(), line[12], line[13:18].strip() 623 624 if het_chain in structure: 625 chain = structure.chains[het_chain] 626 if chain.type == SequenceTypes.Protein and het_residue in PDB_AMINOACIDS: 627 het_residues[het_chain].add(het_residue_id) 628 elif chain.type == SequenceTypes.NucleicAcid and het_residue in PDB_NUCLEOTIDES: 629 het_residues[het_chain].add(het_residue_id) 630 631 elif line.startswith('MODEL'): 632 if model and model != int(line[10:14]): 633 self._scroll_model(model, self._stream) 634 structure.model_id = model 635 else: 636 model = int(line[10:14]) 637 structure.model_id = model 638 639 elif line.startswith('ATOM') \ 640 or (line.startswith('HETATM') and not in_ligands): 641 in_atom = True 642 643 rank = int(line[22:26]) 644 serial_number = int(line[6:11]) 645 name = line[12:16] 646 x, y, z = line[30:38], line[38:46], line[46:54] 647 vector = numpy.array([float(x), float(y), float(z)]) 648 element = line[76:78].strip() 649 if element: 650 try: 651 element = csb.core.Enum.parsename(ChemElements, element) 652 except csb.core.EnumMemberError: 653 if element in ('D', 'X'): 654 element = ChemElements.x 655 else: 656 raise StructureFormatError('Unknown chemical element: {0}'.format(element)) 657 else: 658 element = None 659 660 atom = csb.bio.structure.Atom(serial_number, name, element, 661 vector) 662 663 atom._het = line.startswith('HETATM') 664 atom._rank = rank 665 atom._sequence_number = int(line[22:26].strip()) 666 atom._residue_id = str(atom._sequence_number) 667 atom._insertion_code = line[26].strip() 668 if not atom._insertion_code: 669 atom._insertion_code = None 670 else: 671 atom._residue_id += atom._insertion_code 672 673 atom.alternate = line[16].strip() 674 if not atom.alternate: 675 atom.alternate = None 676 677 atom._chain = line[21].strip() 678 if atom._chain not in structure.chains: 679 raise StructureFormatError( 680 'Atom {0}: chain {1} is undefined'.format(serial_number, atom._chain)) 681 chains.add(atom._chain) 682 residue_name = line[17:20].strip() 683 residue_name = self.parse_residue_safe(residue_name, as_type=structure.chains[atom._chain].type) 684 if structure.chains[atom._chain].type == SequenceTypes.NucleicAcid: 685 atom._residue_name = csb.core.Enum.parsename(SequenceAlphabets.Nucleic, residue_name) 686 else: 687 atom._residue_name = csb.core.Enum.parsename(SequenceAlphabets.Protein, residue_name) 688 689 atom.occupancy = float(line[54:60]) 690 atom.temperature = float(line[60:66]) 691 692 charge = line[78:80].strip() 693 if charge: 694 if charge in ('+', '-'): 695 charge += '1' 696 if charge[-1] in ('+', '-'): 697 charge = charge[::-1] 698 atom.charge = int(charge) 699 700 atoms[atom._chain].append(atom) 701 702 elif in_atom and line.startswith('TER'): 703 in_atom = False 704 if len(chains) == total_chains: 705 in_ligands = True 706 707 elif line.startswith('ENDMDL'): 708 break 709 710 elif line.startswith('END'): 711 break 712 713 if structure.model_id != model: 714 raise ValueError('No such model {0} in the structure.'.format(model)) 715 716 self._map_residues(structure, atoms, het_residues)
717
718 - def _map_residues(self, structure, atoms, het_residues):
719 720 assert set(structure.chains) == set(atoms.keys()) 721 722 for chain in structure.chains: 723 724 subject = structure.chains[chain].sequence 725 query = '' 726 fragments = [] 727 728 seq_numbers = [] 729 lookup = {} 730 731 i = -1 732 for a in atoms[chain]: 733 if a._residue_id not in lookup: 734 i += 1 735 lookup[a._residue_id] = [a._sequence_number, a._insertion_code] 736 seq_numbers.append(a._residue_id) 737 res_name = a._residue_name.value 738 res_id = '{0}{1}'.format(a._sequence_number or '', a._insertion_code or '').strip() 739 if a._het and res_id not in het_residues[chain]: 740 # if it is a HETATM, but not a modified residue, initiate an optional fragment 741 fragments.append([res_name, '?']) 742 elif a._insertion_code and not (i > 0 and lookup[seq_numbers[i - 1]][1]): 743 fragments.append([res_name]) 744 elif i == 0 or a._sequence_number - lookup[seq_numbers[i - 1]][0] not in (0, 1, -1): 745 # if residues [i, i-1] are not consecutive or 'overlapping', initiate a new fragment: 746 fragments.append([res_name]) 747 else: 748 # then they are consecutive 749 if fragments[-1][-1] == '?': 750 # but the prev was optional (maybe HET), all optionals *MUST* reside in 751 # singleton fragments, so start a new fragment 752 fragments.append([res_name]) 753 else: 754 # append the new residue to the end of the last fragment 755 fragments[-1].append(res_name) 756 757 for i, frag in enumerate(fragments): 758 fragments[i] = ''.join(frag) 759 if len(fragments) > 100: 760 # Python's regex engine will crash with more than 100 groups 761 raise StructureFormatError("Can't map structure with more than 100 fragments in ATOMS") 762 query = '^.*?({0}).*?$'.format(').*?('.join(fragments)) 763 764 matches = re.match(query, subject) 765 766 if matches: 767 seq_numitem = -1 768 for frag_number, frag in enumerate(matches.groups(), start=1): 769 if frag is '': 770 # optional fragment, which did not match (NOTE: all optionals must occupy 771 # their own fragments of length 1 residue) 772 seq_numitem += 1 # this 1 implies that all optional fragments are 1 residue long 773 else: 774 for i, dummy in enumerate(frag, start=1): 775 seq_numitem += 1 776 # lookup[res_id] is finally set to give the final rank of residue under id res_id: 777 try: 778 lookup[ seq_numbers[seq_numitem] ] = matches.start(frag_number) + i 779 except: 780 raise 781 782 fixed_residue = None 783 for atom in atoms[chain]: 784 if not isinstance(lookup[atom._residue_id], int): 785 continue # this atom was not mapped (e.g. HET) 786 atom._rank = lookup[atom._residue_id] 787 residue = structure.chains[chain].residues[atom._rank] 788 if residue is not fixed_residue: 789 residue.id = atom._sequence_number, atom._insertion_code 790 fixed_residue = residue 791 792 assert str(residue.type) == subject[atom._rank - 1] 793 residue.atoms.append(atom) 794 795 del atom._rank 796 del atom._insertion_code 797 del atom._sequence_number 798 del atom._chain 799 del atom._residue_id 800 del atom._residue_name 801 else: 802 if structure.chains[chain].length == 0 and len(atoms[chain]) > 0: 803 raise StructureFormatError("Can't add atoms: chain {0} has no residues".format(chain)) 804 else: 805 raise StructureFormatError("Can't map atoms")
806
807 - def _parse_ss(self, structure):
808 """ 809 Parse and attach secondary structure data. 810 811 @bug: Currently the PDB helix types are ignored. Each HELIX line is treated 812 as a regular SecStructures.Helix. This is due to incompatibility 813 between DSSP and PDB helix types. 814 @todo: Implement a proper workaround for the previous bug (e.g. skip all 815 helices types not included in the DSSP enum) 816 817 @warning: In this implementation only the start/end positions of the SS 818 elements are parsed. Additional data like H-bonding is ignored. 819 820 @bug: Currently structure.to_pdb() is not writing any SS data. 821 """ 822 elements = {} 823 self._stream.seek(0) 824 825 while True: 826 try: 827 line = next(self._stream) 828 except StopIteration: 829 break 830 831 if line.startswith('HELIX'): 832 833 chain = structure.chains[line[19].strip()] 834 if chain.id not in elements: 835 elements[chain.id] = [] 836 if chain.id != line[31].strip(): 837 if self._check_ss: 838 raise SecStructureFormatError('Helix {0} spans multiple chains'.format(line[7:10])) 839 else: 840 continue 841 try: 842 startres = chain.find(line[21:25].strip(), line[25].strip()) 843 endres = chain.find(line[33:37].strip(), line[37].strip()) 844 except csb.core.ItemNotFoundError as ex: 845 if self._check_ss: 846 raise SecStructureFormatError( 847 'Helix {0} refers to an undefined residue ID: {1}'.format(line[7:10], str(ex))) 848 else: 849 continue 850 if not startres.rank <= endres.rank: 851 if self._check_ss: 852 raise SecStructureFormatError('Helix {0} is out of range'.format(line[7:10])) 853 else: 854 continue 855 helix = csb.bio.structure.SecondaryStructureElement(startres.rank, endres.rank, SecStructures.Helix) 856 elements[chain.id].append(helix) 857 858 if line.startswith('SHEET'): 859 860 chain = structure.chains[line[21].strip()] 861 if chain.id not in elements: 862 elements[chain.id] = [] 863 if chain.id != line[32].strip(): 864 if self._check_ss: 865 raise SecStructureFormatError('Sheet {0} spans multiple chains'.format(line[7:10])) 866 else: 867 continue 868 try: 869 startres = chain.find(line[22:26].strip(), line[26].strip()) 870 endres = chain.find(line[33:37].strip(), line[37].strip()) 871 except csb.core.ItemNotFoundError as ex: 872 if self._check_ss: 873 raise SecStructureFormatError( 874 'Sheet {0} refers to an undefined residue ID: {1}'.format(line[7:10], str(ex))) 875 else: 876 continue 877 if not startres.rank <= endres.rank: 878 if self._check_ss: 879 raise SecStructureFormatError('Sheet {0} is out of range'.format(line[7:10])) 880 else: 881 continue 882 strand = csb.bio.structure.SecondaryStructureElement(startres.rank, endres.rank, SecStructures.Strand) 883 elements[chain.id].append(strand) 884 885 elif line.startswith('MODEL') or line.startswith('ATOM'): 886 break 887 888 for chain_id in elements: 889 ss = csb.bio.structure.SecondaryStructure() 890 for e in elements[chain_id]: 891 ss.append(e) 892 structure.chains[chain_id].secondary_structure = ss
893
894 - def _parse_remarks(self):
895 """ 896 Read REMARK lines from PDB file. 897 898 @return: dictionary with remark numbers as keys, and lists of lines as values. 899 @rtype: dict 900 """ 901 self._stream.seek(0) 902 903 remarks = {} 904 905 for line in self._stream: 906 if line.startswith('REMARK'): 907 num = int(line[7:10]) 908 lstring = line[11:] 909 remarks.setdefault(num, []).append(lstring) 910 elif line.startswith('DBREF') or line.startswith('ATOM'): 911 break 912 913 return remarks
914
915 916 -class RegularStructureParser(AbstractStructureParser):
917 """ 918 This is the de facto PDB parser, which is designed to read SEQRES and ATOM 919 sections separately, and them map them. Intentionally fails to parse 920 malformed PDB files, e.g. a PDB file without a HEADER section. 921 """ 922
923 - def _parse_header(self, model):
924 """ 925 Parse the HEADER section of a regular PDB file. 926 927 @return: a L{csb.bio.structure.Structure} instance with properly 928 initialized residues from the SEQRES. 929 @rtype: L{csb.bio.structure.Structure} 930 931 @raise PDBParseError: if the stream has no HEADER at byte 0 932 """ 933 934 self._stream.seek(0) 935 936 header = next(self._stream) 937 if not header.startswith('HEADER'): 938 raise PDBParseError('Does not look like a regular PDB file.') 939 940 structure = csb.bio.structure.Structure(header.split()[-1]) 941 942 while True: 943 944 try: 945 line = next(self._stream) 946 except StopIteration: 947 break 948 949 if line.startswith('COMPND'): 950 if line[10:].lstrip().startswith('MOL_ID:'): 951 mol_id = int(line[18:].replace(';', '').strip()) 952 chain_name = '' 953 chains = '' 954 while line.startswith('COMPND'): 955 line = next(self._stream) 956 if line.split()[2].startswith('MOLECULE:'): 957 chain_name += line[20:].strip() 958 while not chain_name.endswith(';'): 959 line = next(self._stream) 960 if not line.startswith('COMPND'): 961 break 962 chain_name += ' ' + line[11:].strip() 963 else: 964 while not line.split()[2].startswith('CHAIN:'): 965 line = next(self._stream) 966 if not line.startswith('COMPND'): 967 raise HeaderFormatError('Missing chain identifier in COMPND section') 968 chains = line[17:].strip() 969 while not chains.endswith(';'): 970 line = next(self._stream) 971 if not line.startswith('COMPND'): 972 break 973 chains += ', ' + line[11:].strip() 974 break 975 976 chain_name = chain_name.strip()[:-1] 977 for chain in chains.replace(';', ' ').replace(',', ' ').split() or ['']: # the second part deals with an empty chain id 978 new_chain = csb.bio.structure.Chain(chain, type=SequenceTypes.Unknown, 979 name=chain_name, accession=structure.accession) 980 new_chain.molecule_id = mol_id 981 try: 982 structure.chains.append(new_chain) 983 except csb.core.DuplicateKeyError: 984 raise HeaderFormatError('Chain {0} is already defined.'.format(new_chain.id)) 985 986 elif line.startswith('SEQRES'): 987 # Correct handling of empty chain id 988 seq_fields = [line[7:10], line[11], line[13:17] ] 989 seq_fields.extend(line[18:].split()) 990 991 rank_base = int(seq_fields[0]) 992 chain_id = seq_fields[1].strip() 993 994 if chain_id not in structure.chains: 995 raise HeaderFormatError('Chain {0} is undefined'.format(chain_id)) 996 997 chain = structure.chains[chain_id] 998 999 if chain.type == SequenceTypes.Unknown: 1000 inner_residuerank = int(len(seq_fields[3:]) / 2) + 3 1001 for i in [inner_residuerank, 3, -1]: 1002 try: 1003 chain.type = self.guess_sequence_type(seq_fields[i]) 1004 break 1005 except UnknownPDBResidueError: 1006 pass 1007 1008 for i, residue_name in enumerate(seq_fields[3:]): 1009 rank = rank_base * 13 - (13 - (i + 1)) 1010 rname = self.parse_residue_safe(residue_name, as_type=chain.type) 1011 residue = csb.bio.structure.Residue.create(chain.type, rank=rank, type=rname) 1012 residue._pdb_name = residue_name 1013 structure.chains[chain_id].residues.append(residue) 1014 assert structure.chains[chain_id].residues.last_index == rank 1015 1016 elif line.startswith('MODEL') or line.startswith('ATOM'): 1017 break 1018 1019 return structure
1020
1021 1022 -class PDBHeaderParser(RegularStructureParser):
1023 """ 1024 Ultra fast PDB HEADER parser. Does not read any structural data. 1025 """ 1026
1027 - def _parse_atoms(self, structure, model):
1028 pass
1029
1030 - def _parse_ss(self, structure):
1031 pass
1032
1033 - def _parse_header(self, model):
1034 return super(PDBHeaderParser, self)._parse_header(model)
1035
1036 1037 -class LegacyStructureParser(AbstractStructureParser):
1038 """ 1039 This is a customized PDB parser, which is designed to read both sequence and 1040 atom data from the ATOM section. This is especially useful when parsing PDB 1041 files without a header. 1042 """ 1043
1044 - def _parse_header(self, model):
1045 """ 1046 Initialize a structure with residues from the ATOMs section. 1047 1048 @param model: model identifier (e.g. if multiple models exist) 1049 @type model: str 1050 1051 @return: a L{csb.bio.structure.Structure} instance with properly 1052 initialized residues from ATOMs under the specified C{model}. 1053 @rtype: L{csb.bio.structure.Structure} 1054 """ 1055 1056 self._stream.seek(0) 1057 in_atom = False 1058 has_atoms = False 1059 chains = csb.core.OrderedDict() 1060 1061 header = next(self._stream) 1062 if header.startswith('HEADER'): 1063 structure = csb.bio.structure.Structure(header.split()[-1]) 1064 else: 1065 self._stream.seek(0) 1066 structure = csb.bio.structure.Structure('NONE') 1067 1068 structure.model_id = None 1069 1070 while True: 1071 1072 try: 1073 line = next(self._stream) 1074 except StopIteration: 1075 break 1076 1077 if line.startswith('MODEL'): 1078 if model and model != int(line[10:14]): 1079 self._scroll_model(model, self._stream) 1080 structure.model_id = model 1081 else: 1082 model = int(line[10:14]) 1083 structure.model_id = model 1084 1085 elif line.startswith('ATOM') \ 1086 or (in_atom and line.startswith('HETATM')): 1087 in_atom = True 1088 has_atoms = True 1089 1090 residue_id = line[22:27].strip() 1091 residue_name = line[17:20].strip() 1092 chain_id = line[21].strip() 1093 1094 if chain_id not in chains: 1095 chains[chain_id] = csb.core.OrderedDict() 1096 1097 new_chain = csb.bio.structure.Chain( 1098 chain_id, 1099 type=SequenceTypes.Unknown, 1100 accession=structure.accession) 1101 new_chain.molecule_id = '1' 1102 structure.chains.append(new_chain) 1103 1104 if residue_id not in chains[chain_id]: 1105 chains[chain_id][residue_id] = residue_name 1106 1107 if structure.chains[chain_id].type == SequenceTypes.Unknown: 1108 try: 1109 structure.chains[chain_id].type = self.guess_sequence_type(residue_name) 1110 except UnknownPDBResidueError: 1111 pass 1112 1113 elif in_atom and line.startswith('TER'): 1114 in_atom = False 1115 1116 elif line.startswith('ENDMDL'): 1117 break 1118 1119 elif line.startswith('END'): 1120 break 1121 1122 if not has_atoms: 1123 raise HeaderFormatError("Can't parse legacy structure: no ATOMs found") 1124 1125 for chain_id in structure.chains: 1126 chain = structure.chains[chain_id] 1127 1128 for residue_id in chains[chain_id]: 1129 residue_name = chains[chain_id][residue_id] 1130 rank = (chain.residues.last_index or 0) + 1 1131 1132 rname = self.parse_residue_safe(residue_name, as_type=structure.chains[chain_id].type) 1133 residue = csb.bio.structure.Residue.create(chain.type, rank=rank, type=rname) 1134 residue._pdb_name = residue_name 1135 chain.residues.append(residue) 1136 1137 return structure
1138 1139 1140 StructureParser = AbstractStructureParser.create_parser 1141 """ 1142 Alias for L{AbstractStructureParser.create_parser}. 1143 """
1144 1145 -class FileBuilder(object):
1146 """ 1147 Base abstract files for all structure file formatters. 1148 Defines a common step-wise interface according to the Builder pattern. 1149 1150 @param output: output stream (this is where the product is constructed) 1151 @type output: stream 1152 """ 1153 1154 __metaclass__ = ABCMeta 1155
1156 - def __init__(self, output):
1157 1158 if not hasattr(output, 'write'): 1159 raise TypeError(output) 1160 1161 def isnull(this, that, null=None): 1162 if this is null: 1163 return that 1164 else: 1165 return this
1166 1167 self._out = output 1168 self._isnull = isnull
1169 1170 @property
1171 - def output(self):
1172 """ 1173 Destination stream 1174 @rtype: stream 1175 """ 1176 return self._out
1177 1178 @property
1179 - def isnull(self):
1180 """ 1181 ISNULL(X, Y) function 1182 @rtype: callable 1183 """ 1184 return self._isnull
1185
1186 - def write(self, text):
1187 """ 1188 Write a chunk of text 1189 """ 1190 self._out.write(text)
1191
1192 - def writeline(self, text):
1193 """ 1194 Write a chunk of text and append a new line terminator 1195 """ 1196 self._out.write(text) 1197 self._out.write('\n')
1198 1199 @abstractmethod
1200 - def add_header(self, master_structure):
1201 pass
1202 1203 @abstractmethod
1204 - def add_structure(self, structure):
1205 pass
1206
1207 - def finalize(self):
1208 pass
1209
1210 -class PDBFileBuilder(FileBuilder):
1211 """ 1212 PDB file format builder. 1213 """ 1214
1215 - def writeline(self, text):
1216 self.write('{0:80}\n'.format(text))
1217
1218 - def add_header(self, master):
1219 """ 1220 Write the HEADER of the file using C{master} 1221 1222 @type master: L{Structure} 1223 """ 1224 1225 isnull = self.isnull 1226 1227 header = 'HEADER {0:40}{1:%d-%b-%y} {2:4}' 1228 self.writeline(header.format('.', datetime.datetime.now(), master.accession.upper())) 1229 1230 molecules = { } 1231 1232 for chain_id in master.chains: 1233 chain = master.chains[chain_id] 1234 if chain.molecule_id not in molecules: 1235 molecules[chain.molecule_id] = [ ] 1236 molecules[chain.molecule_id].append(chain_id) 1237 1238 k = 0 1239 for mol_id in sorted(molecules): 1240 1241 chains = molecules[mol_id] 1242 first_chain = master.chains[ chains[0] ] 1243 1244 self.writeline('COMPND {0:3} MOL_ID: {1};'.format(k + 1, isnull(mol_id, '0'))) 1245 self.writeline('COMPND {0:3} MOLECULE: {1};'.format(k + 2, isnull(first_chain.name, ''))) 1246 self.writeline('COMPND {0:3} CHAIN: {1};'.format(k + 3, ', '.join(chains))) 1247 k += 3 1248 1249 for chain_id in master.chains: 1250 1251 chain = master.chains[chain_id] 1252 res = [ r._pdb_name for r in chain.residues ] 1253 1254 rn = 0 1255 for j in range(0, chain.length, 13): 1256 rn += 1 1257 residues = [ '{0:>3}'.format(r) for r in res[j : j + 13] ] 1258 self.writeline('SEQRES {0:>3} {1} {2:>4} {3}'.format( 1259 rn, chain.id, chain.length, ' '.join(residues) ))
1260
1261 - def add_structure(self, structure):
1262 """ 1263 Append a new model to the file 1264 1265 @type structure: L{Structure} 1266 """ 1267 1268 isnull = self.isnull 1269 1270 for chain_id in structure.chains: 1271 1272 chain = structure.chains[chain_id] 1273 for residue in chain.residues: 1274 1275 atoms = [ ] 1276 for an in residue.atoms: 1277 atom = residue.atoms[an] 1278 if isinstance(atom, csb.bio.structure.DisorderedAtom): 1279 for dis_atom in atom: atoms.append(dis_atom) 1280 else: 1281 atoms.append(atom) 1282 atoms.sort() 1283 1284 for atom in atoms: 1285 1286 alt = atom.alternate 1287 if alt is True: 1288 alt = 'A' 1289 elif alt is False: 1290 alt = ' ' 1291 1292 if atom.element: 1293 element = repr(atom.element) 1294 else: 1295 element = ' ' 1296 self.writeline('ATOM {0:>5} {1:>4}{2}{3:>3} {4}{5:>4}{6} {7:>8.3f}{8:>8.3f}{9:>8.3f}{10:>6.2f}{11:>6.2f}{12:>12}{13:2}'.format( 1297 atom.serial_number, atom._full_name, isnull(alt, ' '), 1298 residue._pdb_name, chain.id, 1299 isnull(residue.sequence_number, residue.rank), isnull(residue.insertion_code, ' '), 1300 atom.vector[0], atom.vector[1], atom.vector[2], isnull(atom.occupancy, 0.0), isnull(atom.temperature, 0.0), 1301 element, isnull(atom.charge, ' ') )) 1302 1303 self.writeline('TER')
1304
1305 - def finalize(self):
1306 """ 1307 Add the END marker 1308 """ 1309 self.writeline('END') 1310 self._out.flush()
1311
1312 -class PDBEnsembleFileBuilder(PDBFileBuilder):
1313 """ 1314 Supports serialization of NMR ensembles. 1315 1316 Functions as a simple decorator, which wraps C{add_structure} with 1317 MODEL/ENDMDL records. 1318 """ 1319
1320 - def add_structure(self, structure):
1321 1322 model_id = self.isnull(structure.model_id, 1) 1323 1324 self.writeline('MODEL {0:>4}'.format(model_id)) 1325 super(PDBEnsembleFileBuilder, self).add_structure(structure) 1326 self.writeline('ENDMDL')
1327
1328 1329 -class StructureProvider(object):
1330 """ 1331 Base class for all PDB data source providers. 1332 1333 Concrete classes need to implement the C{find} method, which abstracts the 1334 retrieval of a PDB structure file by a structure identifier. This is a hook 1335 method called internally by C{get}, but subclasses can safely override both 1336 C{find} and {get} to in order to achieve completely custom behavior. 1337 """ 1338 1339 __metaclass__ = ABCMeta 1340
1341 - def __getitem__(self, id):
1342 return self.get(id)
1343 1344 @abstractmethod
1345 - def find(self, id):
1346 """ 1347 Attempt to discover a PDB file, given a specific PDB C{id}. 1348 1349 @param id: structure identifier (e.g. 1x80) 1350 @type id: str 1351 @return: path and file name on success, None otherwise 1352 @rtype: str or None 1353 """ 1354 pass
1355
1356 - def get(self, id, model=None):
1357 """ 1358 Discover, parse and return the PDB structure, corresponding to the 1359 specified C{id}. 1360 1361 @param id: structure identifier (e.g. 1x80) 1362 @type id: str 1363 @param model: optional model identifier 1364 @type model: str 1365 1366 @rtype: L{csb.bio.Structure} 1367 1368 @raise StructureNotFoundError: when C{id} could not be found 1369 """ 1370 pdb = self.find(id) 1371 1372 if pdb is None: 1373 raise StructureNotFoundError(id) 1374 else: 1375 return StructureParser(pdb).parse_structure(model=model)
1376
1377 -class FileSystemStructureProvider(StructureProvider):
1378 """ 1379 Simple file system based PDB data source. Scans a list of local directories 1380 using pre-defined file name templates. 1381 1382 @param paths: a list of paths 1383 @type paths: iterable or str 1384 """ 1385
1386 - def __init__(self, paths=None):
1387 1388 self._templates = ['pdb{id}.ent', 'pdb{id}.pdb', '{id}.pdb', '{id}.ent'] 1389 self._paths = csb.core.OrderedDict() 1390 1391 if paths is not None: 1392 if isinstance(paths, csb.core.string): 1393 paths = [paths] 1394 1395 for path in paths: 1396 self.add(path)
1397 1398 @property
1399 - def paths(self):
1400 """ 1401 Current search paths 1402 @rtype: tuple 1403 """ 1404 return tuple(self._paths)
1405 1406 @property
1407 - def templates(self):
1408 """ 1409 Current file name match templates 1410 @rtype: tuple 1411 """ 1412 return tuple(self._templates)
1413
1414 - def add(self, path):
1415 """ 1416 Register a new local C{path}. 1417 1418 @param path: directory name 1419 @type path: str 1420 1421 @raise IOError: if C{path} is not a valid directory 1422 """ 1423 if os.path.isdir(path): 1424 self._paths[path] = path 1425 else: 1426 raise IOError(path)
1427
1428 - def add_template(self, template):
1429 """ 1430 Register a custom file name name C{template}. The template must contain 1431 an E{lb}idE{rb} macro, e.g. pdbE{lb}idE{rb}.ent 1432 1433 @param template: pattern 1434 @type template: str 1435 """ 1436 1437 if '{id}' not in template: 1438 raise ValueError('Template does not contain an "{id}" macro') 1439 1440 if template not in self._templates: 1441 self._templates.append(template)
1442
1443 - def remove(self, path):
1444 """ 1445 Unregister an existing local C{path}. 1446 1447 @param path: directory name 1448 @type path: str 1449 1450 @raise ValueError: if C{path} had not been registered 1451 """ 1452 if path not in self._paths: 1453 raise ValueError('path not found: {0}'.format(path)) 1454 1455 del self._paths[path]
1456
1457 - def find(self, id):
1458 1459 for path in self._paths: 1460 for token in self.templates: 1461 fn = os.path.join(path, token.format(id=id)) 1462 if os.path.exists(fn): 1463 return fn 1464 1465 return None
1466
1467 -class RemoteStructureProvider(StructureProvider):
1468 """ 1469 Retrieves PDB structures from a specified remote URL. 1470 The URL requested from remote server takes the form: <prefix>/<ID><suffix> 1471 1472 @param prefix: URL prefix, including protocol 1473 @type prefix: str 1474 @param suffix: optional URL suffix (.ent by default) 1475 @type suffix: str 1476 """ 1477
1478 - def __init__(self, prefix='http://www.rcsb.org/pdb/files/pdb', suffix='.ent'):
1479 1480 self._prefix = None 1481 self._suffix = None 1482 1483 self.prefix = prefix 1484 self.suffix = suffix
1485 1486 @property
1487 - def prefix(self):
1488 """ 1489 Current URL prefix 1490 @rtype: str 1491 """ 1492 return self._prefix
1493 @prefix.setter
1494 - def prefix(self, value):
1495 self._prefix = value
1496 1497 @property
1498 - def suffix(self):
1499 """ 1500 Current URL suffix 1501 @rtype: str 1502 """ 1503 return self._suffix
1504 @suffix.setter
1505 - def suffix(self, value):
1506 self._suffix = value
1507
1508 - def _find(self, id):
1509 1510 try: 1511 return csb.io.urllib.urlopen(self.prefix + id + self.suffix) 1512 except: 1513 raise StructureNotFoundError(id)
1514
1515 - def find(self, id):
1516 1517 stream = self._find(id) 1518 1519 try: 1520 tmp = csb.io.TempFile(dispose=False) 1521 tmp.write(stream.read().decode('utf-8')) 1522 tmp.flush() 1523 return tmp.name 1524 1525 except StructureNotFoundError: 1526 return None 1527 finally: 1528 stream.close()
1529
1530 - def get(self, id, model=None):
1531 1532 stream = self._find(id) 1533 1534 try: 1535 with csb.io.TempFile() as tmp: 1536 tmp.write(stream.read().decode('utf-8')) 1537 tmp.flush() 1538 return StructureParser(tmp.name).parse_structure(model=model) 1539 finally: 1540 stream.close()
1541
1542 -class CustomStructureProvider(StructureProvider):
1543 """ 1544 A custom PDB data source. Functions as a user-defined map of structure 1545 identifiers and their corresponding local file names. 1546 1547 @param files: initialization dictionary of id:file pairs 1548 @type files: dict-like 1549 """ 1550
1551 - def __init__(self, files={}):
1552 1553 self._files = {} 1554 for id in files: 1555 self.add(id, files[id])
1556 1557 @property
1558 - def paths(self):
1559 """ 1560 List of currently registered file names 1561 @rtype: tuple 1562 """ 1563 return tuple(self._files.values())
1564 1565 @property
1566 - def identifiers(self):
1567 """ 1568 List of currently registered structure identifiers 1569 @rtype: tuple 1570 """ 1571 return tuple(self._files)
1572
1573 - def add(self, id, path):
1574 """ 1575 Register a new local C{id}:C{path} pair. 1576 1577 @param id: structure identifier 1578 @type id: str 1579 @param path: path and file name 1580 @type path: str 1581 1582 @raise IOError: if C{path} is not a valid file name 1583 """ 1584 if os.path.isfile(path): 1585 self._files[id] = path 1586 else: 1587 raise IOError(path)
1588
1589 - def remove(self, id):
1590 """ 1591 Unregister an existing structure C{id}. 1592 1593 @param id: structure identifier 1594 @type id: str 1595 1596 @raise ValueError: if C{id} had not been registered 1597 """ 1598 if id not in self._files: 1599 raise ValueError(id) 1600 else: 1601 del self._files[id]
1602
1603 - def find(self, id):
1604 1605 if id in self._files: 1606 return self._files[id] 1607 else: 1608 return None
1609
1610 -def get(accession, model=None, prefix='http://www.rcsb.org/pdb/files/pdb'):
1611 """ 1612 Download and parse a PDB entry. 1613 1614 @param accession: accession number of the entry 1615 @type accession: str 1616 @param model: model identifier 1617 @type model: str 1618 @param prefix: download URL prefix 1619 @type prefix: str 1620 1621 @return: object representation of the selected model 1622 @rtype: L{Structure} 1623 """ 1624 return RemoteStructureProvider(prefix).get(accession, model=model)
1625
1626 -def find(id, paths):
1627 """ 1628 Try to discover a PDB file for PDB C{id} in C{paths}. 1629 1630 @param id: PDB ID of the entry 1631 @type id: str 1632 @param paths: a list of directories to scan 1633 @type paths: list of str 1634 1635 @return: path and file name on success, None otherwise 1636 @rtype: str 1637 """ 1638 return FileSystemStructureProvider(paths).find(id)
1639
1640 1641 -class AsyncParseResult(object):
1642
1643 - def __init__(self, result, exception):
1644 1645 self.result = result 1646 self.exception = exception
1647
1648 - def __repr__(self):
1649 return '<AsyncParseResult: result={0.result}, error={0.exception.__class__.__name__}>'.format(self)
1650
1651 1652 -def _parse_async(parser, file, model):
1653 p = parser(file) 1654 return p.parse_structure(model)
1655
1656 1657 -class AsyncStructureParser(object):
1658 """ 1659 Wraps StructureParser in an asynchronous call. Since a new process is 1660 started by Python internally (as opposed to only starting a new thread), 1661 this makes the parser slower, but provides a way to set a parse timeout 1662 limit. 1663 1664 If initialized with more than one worker, supports parallel parsing 1665 through the C{self.parse_async} method. 1666 1667 @param workers: number of worker threads (1 by default) 1668 @type workers: int 1669 """ 1670
1671 - def __init__(self, workers=1):
1672 1673 self._pool = None 1674 self._workers = 1 1675 1676 if int(workers) > 0: 1677 self._workers = int(workers) 1678 else: 1679 raise ValueError(workers) 1680 1681 self._recycle()
1682
1683 - def _recycle(self):
1684 1685 if self._pool: 1686 self._pool.terminate() 1687 1688 self._pool = multiprocessing.Pool(processes=self._workers)
1689
1690 - def parse_structure(self, structure_file, timeout, model=None, 1691 parser=RegularStructureParser):
1692 """ 1693 Call StructureParser.parse_structure() in a separate process and return 1694 the output. Raise TimeoutError if the parser does not respond within 1695 C{timeout} seconds. 1696 1697 @param structure_file: structure file to parse 1698 @type structure_file: str 1699 @param timeout: raise multiprocessing.TimeoutError if C{timeout} seconds 1700 elapse before the parser completes its job 1701 @type timeout: int 1702 @param parser: any implementing L{AbstractStructureParser} class 1703 @type parser: type 1704 1705 @return: parsed structure 1706 @rtype: L{csb.structure.Structure} 1707 """ 1708 1709 r = self.parse_async([structure_file], timeout, model, parser) 1710 if len(r) > 0: 1711 if r[0].exception is not None: 1712 raise r[0].exception 1713 else: 1714 return r[0].result 1715 return None
1716
1717 - def parse_async(self, structure_files, timeout, model=None, 1718 parser=RegularStructureParser):
1719 """ 1720 Call C{self.parse_structure} for a list of structure files 1721 simultaneously. The actual degree of parallelism will depend on the 1722 number of workers specified while constructing the parser object. 1723 1724 @note: Don't be tempted to pass a large list of structures to this 1725 method. Every time a C{TimeoutError} is encountered, the 1726 corresponding worker process in the pool will hang until the 1727 process terminates on its own. During that time, this worker is 1728 unusable. If a sufficiently high number of timeouts occur, the 1729 whole pool of workers will be unsable. At the end of the method 1730 however a pool cleanup is performed and any unusable workers 1731 are 'reactivated'. However, that only happens at B{the end} of 1732 C{parse_async}. 1733 1734 @param structure_files: a list of structure files 1735 @type structure_files: tuple of str 1736 @param timeout: raise multiprocessing.TimeoutError if C{timeout} seconds 1737 elapse before the parser completes its job 1738 @type timeout: int 1739 @param parser: any implementing L{AbstractStructureParser} class 1740 @type parser: type 1741 1742 @return: a list of L{AsyncParseResult} objects 1743 @rtype: list 1744 """ 1745 1746 pool = self._pool 1747 workers = [] 1748 results = [] 1749 1750 for file in list(structure_files): 1751 result = pool.apply_async(_parse_async, [parser, file, model]) 1752 workers.append(result) 1753 1754 hanging = False 1755 for w in workers: 1756 result = AsyncParseResult(None, None) 1757 try: 1758 result.result = w.get(timeout=timeout) 1759 except KeyboardInterrupt as ki: 1760 pool.terminate() 1761 raise ki 1762 except Exception as ex: 1763 result.exception = ex 1764 if isinstance(ex, multiprocessing.TimeoutError): 1765 hanging = True 1766 results.append(result) 1767 1768 if hanging: 1769 self._recycle() 1770 1771 return results
1772