Package csb :: Package bio :: Package io :: Module hhpred
[frames] | no frames]

Source Code for Module csb.bio.io.hhpred

  1  """ 
  2  HHpred-related format parsers. 
  3   
  4  This module defines two HHpred format parsers: L{HHProfileParser} and 
  5  L{HHOutputParser}. The first one is used to read HMM (*.hhm) files: 
  6   
  7      >>> HHProfileParser("profile.hhm", "profile.pdb").parse() 
  8      <ProfileHMM>            # ProfileHMM object 
  9       
 10  while the latter parses HHsearch results files (*.hhr): 
 11   
 12      >>> HHOutputParser().parse_file("hits.hhr"): 
 13      <HHpredHitList>        # collection of HHpredHit-s 
 14       
 15  See L{ProfileHMM}, L{HHpredHitList} and L{HHpredHit} from L{csb.bio.hmm} 
 16  for details. For text serialization of HMM profiles, see L{HHMFileBuilder} 
 17  in this module. 
 18  """ 
 19   
 20  import os 
 21  import re 
 22   
 23  import csb.io 
 24  import csb.bio.io 
 25  import csb.bio.structure as structure 
 26   
 27  from csb.core import Enum, CollectionIndexError, ItemNotFoundError 
 28  from csb.bio.sequence import Sequence, ProteinAlphabet, A3MAlignment 
 29   
 30  from csb.bio.hmm import State, Transition, ProfileHMM, HMMLayer, ProfileLength 
 31  from csb.bio.hmm import HHpredHitList, HHpredHit, ScoreUnits, States, EVDParameters 
 32  from csb.bio.hmm import StateNotFoundError, TransitionNotFoundError 
33 34 35 -class HHProfileFormatError(ValueError):
36 pass
37
38 -class StructureFormatError(HHProfileFormatError):
39 pass
40
41 -class HHOutputFormatError(HHProfileFormatError):
42 pass
43
44 45 -class HHProfileParser(object):
46 """ 47 A class that is HHpred HMM format aware. 48 Produces a L{ProfileHMM} object representation of a given HHpred profile 49 HMM. 50 51 @param hhm_file: *.hhm file name to parse 52 @type hhm_file: str 53 @param pdb_file: an optional hhm structure file, containing the structural 54 data, associated with the HMM. 55 @type pdb_file: str 56 @note: This is NOT a regular PDB file! It must be specifically formatted 57 for use with HHpred. 58 59 @raise IOError: if any of the files does not exist 60 """ 61
62 - def __init__(self, hhm_file, pdb_file=None):
63 64 if not os.path.exists(hhm_file): 65 raise IOError("Could not read HHM file {0}".format(hhm_file)) 66 67 if pdb_file: 68 if not os.path.exists(pdb_file): 69 raise IOError("Could not read structure file {0}".format(pdb_file)) 70 71 self._file = hhm_file 72 self._pdb = pdb_file 73 self._properties = None 74 self._sequences = None 75 self._profile = None 76 77 self._chopped = False 78 self._chop()
79
80 - def format_structure(self, input_pdb, chain_id, output_file):
81 """ 82 Format input_pdb as an HHpred pre-parsed structure file for the current 83 HMM. Formatting means: only chain_id left in the PDB file, residue 84 sequence numbers strictly corresponding to the HMM states' ranks. 85 86 @param input_pdb: source PDB file name 87 @type input_pdb: str 88 @param chain_id: source chain ID (which chain to pick from the 89 structure) 90 @type chain_id: str 91 @param output_file: save the output PDB file here 92 @type output_file: str 93 94 @raise StructureFormatError: when the specified PDB chain does not 95 correspond to the HMM. 96 """ 97 98 hmm = self.parse() 99 s = csb.bio.io.StructureParser(input_pdb).parse_structure() 100 chain = s.chains[chain_id] 101 102 if s.first_chain.length != hmm.layers.length: 103 raise StructureFormatError( 104 "{0}: Incorrect number of residues".format(chain.entry_id)) 105 106 for layer, residue in zip(hmm.layers, chain.residues): 107 108 if residue.type == ProteinAlphabet.UNK: 109 residue.type = layer.residue.type 110 111 if residue.type != layer.residue.type: 112 msg = "Expected residue of type {0} at position {1}" 113 raise StructureFormatError(msg.format(layer.residue.type, layer.rank)) 114 115 residue.id = str(residue.rank), '$' 116 117 for residue in chain.residues: 118 residue.id = str(residue.rank), None 119 120 new_structure = structure.Structure(hmm.id) 121 new_structure.chains.append(chain.clone()) 122 123 new_structure.to_pdb(output_file)
124
125 - def parse(self, units=ScoreUnits.LogScales):
126 """ 127 Parse the HMM profile. 128 129 @param units: also convert the profile score units to the specified 130 L{csb.bio.hmm.ScoreUnits} kind 131 @type units: L{csb.core.EnumItem} 132 133 @return: a L{ProfileHMM} instance 134 @rtype: L{ProfileHMM} 135 136 @raise HHProfileFormatError: when the hhm file is invalid/corrupt 137 """ 138 assert self._chopped 139 140 hmm = ProfileHMM(units=units, scale=-1000.0, logbase=2) 141 if self._profile: 142 self._parse_profile(hmm, units) 143 else: 144 raise HHProfileFormatError('Missing HMM profile table.') 145 146 if self._properties: 147 self._parse_properties(hmm) 148 else: 149 raise HHProfileFormatError('No profile properties found.') 150 151 if self._sequences: 152 self._parse_sequences(hmm) 153 else: 154 raise HHProfileFormatError('No profile MSA and secondary structure found.') 155 156 if hmm.dssp: 157 hmm._assign_secstructure() 158 159 if self._pdb: 160 self._parse_structure(hmm) 161 162 return hmm
163
164 - def _chop(self):
165 """ 166 Chop the HHM file into pieces - HMM properties, secondary structure + 167 MSA, HMM. 168 """ 169 170 try: 171 with open(self._file, 'r') as f: 172 pr = csb.io.EntryReader(f, 'HHsearch', 'SEQ') 173 self._properties = pr.readall()[0].splitlines() 174 175 with open(self._file, 'r') as f: 176 sr = csb.io.EntryReader(f, '>', '#') 177 self._sequences = sr.readall() 178 179 with open(self._file, 'r') as f: 180 hr = csb.io.EntryReader(f, '#', '//') 181 self._profile = hr.readall()[0].splitlines() 182 183 self._chopped = True 184 185 except IndexError: 186 raise HHProfileFormatError('Corrupt HHM file.')
187
188 - def _parse_properties(self, hmm):
189 """ 190 Parse the profile properties 191 192 @param hmm: the hmm object being constructed 193 @type hmm: L{ProfileHMM} 194 @return: the updated hmm 195 @rtype: hmm 196 197 @raise NotImplementedError: if an unexpected data field is encountered 198 """ 199 assert self._chopped 200 201 for line in self._properties: 202 203 if line.startswith('NAME '): 204 hmm.name = line[6:].strip() 205 206 elif line.startswith('FAM '): 207 hmm.family = line[6:].strip() 208 209 elif line.startswith('LENG '): 210 m = re.search('(\d+)\D+(\d+)', line).groups() 211 m = tuple(map(int, m)) 212 hmm.length = ProfileLength(m[0], m[1]) 213 214 elif line.startswith('FILE '): 215 hmm.id = line[6:].strip() 216 217 elif line.startswith('HHsearch'): 218 hmm.version = float(line[9:]) 219 220 elif line.startswith('NEFF '): 221 hmm.effective_matches = float(line[6:]) 222 223 elif line.startswith('PCT '): 224 if line[6:].strip().lower() == 'true': 225 hmm.pseudocounts = True 226 227 elif line.startswith('EVD '): 228 lamda, mu = map(float, line[6:].split()) 229 hmm.evd = EVDParameters(lamda, mu) 230 231 elif line[:5] in ('COM ', 'DATE ', 'FILT '): 232 pass 233 else: 234 raise NotImplementedError("Unexpected line: {0}.".format(line[0:5])) 235 236 if not hmm.id and hmm.name: 237 hmm.id = hmm.name.split()[0] 238 return hmm
239
240 - def _parse_sequences(self, hmm):
241 """ 242 Parse secondary structure and multiple alignment sections. 243 244 @param hmm: the hmm object being constructed 245 @type hmm: L{ProfileHMM} 246 @return: the updated hmm 247 @rtype: hmm 248 """ 249 assert self._chopped 250 251 psipred = None 252 msa_entries = [] 253 254 for entry in self._sequences: 255 256 header_token = entry[:8] 257 if header_token in ['>ss_dssp', '>sa_dssp', '>ss_pred', '>ss_conf', '>Consens']: 258 259 lines = entry.strip().splitlines() 260 seq = re.sub('\s+', '', ''.join(lines[1:])) 261 262 if header_token == '>ss_dssp': 263 hmm.dssp = structure.SecondaryStructure(seq) 264 elif header_token == '>sa_dssp': 265 hmm.dssp_solvent = seq 266 elif header_token == '>ss_pred': 267 psipred = seq 268 elif header_token == '>ss_conf': 269 conf = seq 270 hmm.psipred = structure.SecondaryStructure(psipred, conf) 271 elif header_token == '>Consens': 272 hmm.consensus = Sequence('Consensus', 'Consensus', seq) 273 else: 274 msa_entries.append(entry) 275 276 if msa_entries: 277 msa = '\n'.join(msa_entries) 278 hmm.alignment = A3MAlignment.parse(msa, strict=False) 279 280 return hmm
281
282 - def _parse_profile(self, hmm, units=ScoreUnits.LogScales):
283 """ 284 Parse the HMM profile. 285 286 @param hmm: the hmm object being constructed 287 @type hmm: L{ProfileHMM} 288 @return: the updated hmm 289 @rtype: L{ProfileHMM} 290 291 @raise NotImplementedError: when an unknown transition string is 292 encountered 293 """ 294 assert self._chopped 295 296 # 0. Prepare start and end states 297 hmm.start = State(States.Start) 298 hmm.end = State(States.End) 299 300 residues = None 301 background = {} 302 tran_types = None 303 tran_lines = [] 304 start_probs = None 305 306 lines = iter(self._profile) 307 pattern = re.compile('^[A-Z\-]\s[0-9]+\s+') 308 309 if units == ScoreUnits.LogScales: 310 311 def parse_probability(v): 312 if v.strip() == '*': 313 return None 314 else: 315 return float(v)
316 else: 317 318 def parse_probability(v): 319 if v.strip() == '*': 320 return None 321 else: 322 return hmm._convert(units, float(v), 323 hmm.scale, hmm.logbase)
324 325 # 1. Create all layers (profile columns), create and attach their match states 326 327 while True: 328 try: 329 line = next(lines) 330 except StopIteration: 331 break 332 333 if line.startswith('NULL'): 334 try: 335 backprobs = tuple(map(parse_probability, line.split()[1:])) 336 337 line = next(lines) 338 residues = line.split()[1:] 339 residues = [Enum.parse(ProteinAlphabet, aa) for aa in residues] 340 341 for pos, aa in enumerate(residues): 342 background[aa] = backprobs[pos] 343 344 line = next(lines) 345 tran_types = line.split() 346 347 line = next(lines) 348 start_probs = list(map(parse_probability, line.split())) 349 except StopIteration: 350 break 351 352 elif re.match(pattern, line): 353 emrow = line 354 try: 355 tran_lines.append(next(lines)) 356 #junkrow = next(lines) 357 except StopIteration: 358 break 359 360 emprobs = emrow.split() 361 if len(emprobs) != 23: 362 raise HHProfileFormatError( 363 "Unexpected number of data fields: {0}".format(len(emprobs))) 364 365 rank = int(emprobs[1]) 366 residue = structure.ProteinResidue( 367 rank=rank, type=emprobs[0], sequence_number=rank, insertion_code=None) 368 if residue.type == ProteinAlphabet.GAP: 369 raise HHProfileFormatError("Layer {0} can't be represented by a gap".format(rank)) 370 371 new_layer = hmm.layers.append(HMMLayer(rank, residue)) 372 if new_layer != rank: 373 raise HHProfileFormatError('Layer {0} defined as {1}'.format(new_layer, rank)) 374 375 match = State(States.Match, emit=Enum.members(ProteinAlphabet)) 376 377 match.rank = rank 378 match.background.set(background) 379 380 for col, aa in enumerate(residues): 381 prob = parse_probability(emprobs[col + 2]) 382 match.emission.append(aa, prob) 383 384 hmm.layers[new_layer].append(match) 385 assert hmm.layers.last_index == match.rank 386 387 # 2. Append starting transitions: S -> M[1] and optionally S -> D[1] and S -> I[0]. 388 # States D[1] and I[0] will be created if needed 389 # Note that [0] is not a real layer, I[0] is simply an insertion at the level of Start 390 if len(hmm.layers) > 0: 391 392 first_match = hmm.layers[hmm.layers.start_index] 393 394 if start_probs[0] is None: 395 raise HHProfileFormatError("Transition Start > Match[1] is undefined") 396 397 start_tran = Transition(hmm.start, first_match[States.Match], start_probs[0]) 398 hmm.start.transitions.append(start_tran) 399 400 if start_probs[1] is not None and start_probs[3] is not None: # Start -> I[0] -> M[1] 401 start_ins = State(States.Insertion, emit=Enum.members(ProteinAlphabet)) 402 start_ins.rank = 0 403 start_ins.background.set(background) 404 start_ins.emission = start_ins.background 405 406 hmm.start_insertion = start_ins 407 # Start -> I[0] 408 hmm.start.transitions.append( 409 Transition(hmm.start, hmm.start_insertion, start_probs[1])) 410 # I[0] -> M[1] 411 hmm.start_insertion.transitions.append( 412 Transition(hmm.start_insertion, first_match[States.Match], start_probs[3])) 413 # I[0] -> I[0] 414 if start_probs[4]: 415 hmm.start_insertion.transitions.append( 416 Transition(hmm.start_insertion, hmm.start_insertion, start_probs[4])) 417 418 419 if start_probs[2] is None and start_probs[6] is not None: 420 # M->D is corrupt (*) at the Start layer, using D->D instead 421 start_probs[2] = start_probs[6] 422 423 if start_probs[2] is not None: # Start -> D[1] 424 start_del = State(States.Deletion) 425 start_del.rank = 1 426 hmm.layers[1].append(start_del) 427 start_tran = Transition(hmm.start, first_match[States.Deletion], start_probs[2]) 428 hmm.start.transitions.append(start_tran) 429 else: 430 start_tran = Transition(hmm.start, hmm.end, start_probs[0]) 431 hmm.start.transitions.append(start_tran) 432 433 434 # 3. Append remaining transitions. I and D states will be created on demand. 435 436 for rank, fields in enumerate(tran_lines, start=hmm.layers.start_index): 437 assert hmm.layers[rank][States.Match].rank == rank 438 439 ofields = fields.split() 440 fields = tuple(map(parse_probability, ofields)) 441 442 # 3a. Parse all Neff values and create I[i] and D[i] states if NeffX[i] is not None 443 for col, neff in enumerate(tran_types[7:10], start=7): 444 445 if fields[col] is not None: 446 neff_value = float(ofields[col]) / abs(hmm.scale) 447 448 if neff == 'Neff': 449 hmm.layers[rank].effective_matches = neff_value 450 451 elif neff == 'Neff_I': 452 hmm.layers[rank].effective_insertions = neff_value 453 454 if States.Insertion not in hmm.layers[rank]: 455 insertion = State(States.Insertion, emit=Enum.members(ProteinAlphabet)) 456 insertion.background.set(background) 457 insertion.emission.set(background) 458 insertion.rank = rank 459 hmm.layers[rank].append(insertion) 460 461 elif neff == 'Neff_D': 462 hmm.layers[rank].effective_deletions = neff_value 463 464 if States.Deletion not in hmm.layers[rank] and neff_value > 0: 465 deletion = State(States.Deletion) 466 deletion.rank = rank 467 hmm.layers[rank].append(deletion) 468 469 # 3b. Starting from the first layer, parse all transitions and build the HMM graph stepwise 470 for col, tran in enumerate(tran_types): 471 probability = fields[col] 472 473 if probability is not None: 474 try: 475 self._add_transition(hmm, rank, tran, probability) 476 except (CollectionIndexError, ItemNotFoundError) as ex: 477 msg = "Can't add transition {0} at {1}: {2.__class__.__name__}, {2!s}" 478 raise HHProfileFormatError(msg.format(tran, rank, ex)) 479 480 return hmm 481
482 - def _add_transition(self, hmm, rank, tran, probability):
483 484 match = hmm.layers[rank][States.Match] 485 nextmatch = None 486 if rank < hmm.layers.last_index: 487 nextmatch = hmm.layers[rank + 1][States.Match] 488 else: 489 nextmatch = hmm.end 490 491 if tran == 'M->M': 492 transition = Transition(match, nextmatch, probability) 493 match.transitions.append(transition) 494 495 elif tran == 'M->I': 496 insertion = hmm.layers[rank][States.Insertion] 497 transition = Transition(match, insertion, probability) 498 match.transitions.append(transition) 499 500 elif tran == 'M->D': 501 deletion = State(States.Deletion) 502 deletion.rank = rank + 1 503 hmm.layers[rank + 1].append(deletion) 504 transition = Transition(match, deletion, probability) 505 match.transitions.append(transition) 506 507 elif tran == 'I->M': 508 insertion = hmm.layers[rank][States.Insertion] 509 transition = Transition(insertion, nextmatch, probability) 510 insertion.transitions.append(transition) 511 512 elif tran == 'I->I': 513 insertion = hmm.layers[rank][States.Insertion] 514 selfloop = Transition(insertion, insertion, probability) 515 insertion.transitions.append(selfloop) 516 517 elif tran == 'D->M': 518 deletion = hmm.layers[rank][States.Deletion] 519 transition = Transition(deletion, nextmatch, probability) 520 deletion.transitions.append(transition) 521 522 elif tran == 'D->D': 523 deletion = hmm.layers[rank][States.Deletion] 524 525 if States.Deletion not in hmm.layers[rank + 1]: 526 nextdeletion = State(States.Deletion) 527 nextdeletion.rank = rank + 1 528 hmm.layers[rank + 1].append(nextdeletion) 529 530 else: 531 nextdeletion = hmm.layers[rank + 1][States.Deletion] 532 assert match.transitions[States.Deletion].successor == nextdeletion 533 534 transition = Transition(deletion, nextdeletion, probability) 535 deletion.transitions.append(transition) 536 537 else: 538 if not tran.startswith('Neff'): 539 raise NotImplementedError('Unknown transition: {0}'.format(tran))
540
541 - def _parse_structure(self, hmm):
542 """ 543 Add structural information to an existing HMM. 544 545 @param hmm: the hmm object being constructed 546 @type hmm: L{ProfileHMM} 547 @return: the updated hmm 548 @rtype: L{ProfileHMM} 549 550 @raise ValueError: if the structure file does not refer to the HMM 551 @raise HHProfileFormatError: when a residue cannot be assigned to an HMM layer 552 @raise NotImplementedError: when an unknown data field is encountered 553 in the PDB file 554 """ 555 556 assert self._pdb 557 558 s = csb.bio.io.StructureParser(self._pdb).parse_structure() 559 560 if s.chains.length != 1: 561 raise StructureFormatError("Must contain exactly one chain") 562 if s.first_chain.length != hmm.layers.length: 563 raise StructureFormatError("Incorrect number of residues") 564 565 chain = s.first_chain 566 chain.compute_torsion() 567 568 for layer, residue in zip(hmm.layers, chain.residues): 569 570 if residue.type != layer.residue.type: 571 msg = "Expected residue of type {0} at position {1}" 572 raise StructureFormatError(msg.format(layer.residue.type, layer.rank)) 573 574 layer.residue.torsion = residue.torsion.copy() 575 576 for atom in residue.items: 577 layer.residue.atoms.append(atom.clone()) 578 579 return hmm
580 581 HHpredProfileParser = HHProfileParser
582 583 584 -class HHMFileBuilder(object):
585 """ 586 Builder for HHpred's hhm files. 587 588 @param output: destination stream 589 @type output: file 590 """ 591
592 - def __init__(self, output):
593 594 if not hasattr(output, 'write'): 595 raise TypeError(output) 596 597 self._out = output
598 599 @property
600 - def output(self):
601 """ 602 Destination stream 603 @rtype: stream 604 """ 605 return self._out
606
607 - def write(self, data):
608 self._out.write(data)
609
610 - def writeline(self, data):
611 self.write(data) 612 self.write('\n')
613
614 - def add_hmm(self, hmm):
615 """ 616 Append a new HMM to the destination stream. 617 618 @param hmm: profile HMM to serialize 619 @type hmm: L{ProfileHMM} 620 """ 621 622 if hmm.score_units != ScoreUnits.LogScales: 623 raise ValueError('Scores must be converted to LogScales first.') 624 625 self.writeline('''HHsearch {0.version} 626 NAME {0.name} 627 FAM {0.family} 628 LENG {0.length.matches} match states, {0.length.layers} columns in multiple alignment 629 NEFF {0.effective_matches} 630 PCT {0.pseudocounts}'''.format(hmm)) 631 if hmm.evd: 632 self.writeline('EVD {0.lamda} {0.mu}'.format(hmm.evd)) 633 634 self.writeline('SEQ') 635 if hmm.dssp: 636 self.writeline('>ss_dssp') 637 self.writeline(hmm.dssp.to_string()) 638 if hmm.dssp_solvent: 639 self.writeline('>sa_dssp') 640 self.writeline(hmm.dssp_solvent) 641 if hmm.psipred: 642 self.writeline('>ss_pred') 643 self.writeline(hmm.psipred.to_string()) 644 self.writeline('>ss_conf') 645 confidence = [''.join(map(str, m.score)) for m in hmm.psipred] 646 self.writeline(''.join(confidence)) 647 648 if hmm.alignment: 649 if hmm.consensus: 650 self.writeline(str(hmm.consensus)) 651 self.writeline(hmm.alignment.format().rstrip('\r\n')) 652 653 self.writeline('#') 654 655 first_match = hmm.layers[1][States.Match] 656 null = [int(first_match.background[aa]) 657 for aa in sorted(map(str, first_match.background))] 658 self.writeline('NULL {0}'.format('\t'.join(map(str, null)))) 659 self.writeline('HMM {0}'.format( 660 '\t'.join(sorted(map(str, first_match.emission))))) 661 662 tran_types = 'M->M M->I M->D I->M I->I D->M D->D'.split() 663 self.writeline(' {0}'.format( 664 '\t'.join(tran_types + 'Neff Neff_I Neff_D'.split()))) 665 666 self.write(" ") 667 for tran_type in tran_types: 668 source_statekind = Enum.parse(States, tran_type[0]) 669 target_statekind = Enum.parse(States, tran_type[3]) 670 if source_statekind == States.Match: 671 try: 672 self.write("{0:<7}\t".format( 673 int(hmm.start.transitions[target_statekind].probability))) 674 except TransitionNotFoundError: 675 self.write("*\t") 676 else: 677 self.write("*\t") 678 self.writeline('*\t' * 3) 679 680 for layer in hmm.layers: 681 682 self.write("{0} {1:<5}".format(layer.residue.type, layer.rank)) 683 for aa in sorted(layer[States.Match].emission): 684 emission = layer[States.Match].emission[aa] 685 if emission is None: 686 emission = '*' 687 else: 688 emission = int(emission) 689 self.write("{0:<7}\t".format(emission)) 690 self.writeline("{0}".format(layer.rank)) 691 692 self.write(" ") 693 for tran_type in tran_types: 694 source_statekind = Enum.parse(States, tran_type[0]) 695 target_statekind = Enum.parse(States, tran_type[3]) 696 697 if target_statekind == States.Match and layer.rank == hmm.layers.last_index: 698 target_statekind = States.End 699 700 try: 701 state = layer[source_statekind] 702 self.write("{0:<7}\t".format( 703 int(state.transitions[target_statekind].probability))) 704 except StateNotFoundError: 705 self.write("*\t") 706 707 for data in (layer.effective_matches, layer.effective_insertions, 708 layer.effective_deletions): 709 if data is None: 710 data = '*' 711 else: 712 data = int(data * abs(hmm.scale)) 713 self.write("{0:<7}\t".format(data)) 714 715 self.writeline("\n") 716 717 self.writeline('//')
718
719 720 -class HHOutputParser(object):
721 """ 722 Parser for HHsearch result (*.hhr) files. 723 724 @param alignments: if set to False, the parser will skip the 725 alignments section of the file 726 @type alignments: bool 727 """ 728
729 - def __init__(self, alignments=True):
730 731 self._alignments = True 732 self.alignments = alignments
733
734 - def __repr__(self):
735 return "<HHsearch Result Parser>"
736 737 @property
738 - def alignments(self):
739 """ 740 True if hit alignments will be parsed 741 @rtype: bool 742 """ 743 return self._alignments
744 @alignments.setter
745 - def alignments(self, value):
746 self._alignments = bool(value)
747
748 - def parse_file(self, hhr_file, header_only=False):
749 """ 750 Parse all hits from this HHpred result file. 751 752 @param hhr_file: input HHR file name 753 @type hhr_file: str 754 755 @return: parsed hits 756 @rtype: HHpredHitList 757 758 @raise HHOutputFormatError: if the hhr file is corrupt 759 """ 760 761 with open(os.path.expanduser(hhr_file)) as stream: 762 return self._parse(stream, header_only)
763
764 - def parse_string(self, output, header_only=False):
765 """ 766 Get all hits from an C{output} string. 767 768 @param output: HHpred standard output 769 @type output: str 770 771 @return: parsed hits 772 @rtype: HHpredHitList 773 774 @raise HHOutputFormatError: if the output is corrupt 775 """ 776 stream = csb.io.MemoryStream() 777 stream.write(output) 778 stream.seek(0) 779 780 return self._parse(stream, header_only)
781
782 - def _parse(self, stream, header_only):
783 784 qlen = None 785 in_hits = False 786 in_alis = False 787 has_alis = False 788 c_rank = 0 789 header = {} 790 hits = {} 791 alis = {} 792 793 for line in stream: 794 795 if not in_hits and not in_alis: 796 797 if line.replace(' ', '').startswith('NoHitProbE-value'): 798 in_hits = True 799 continue 800 elif line.strip() == '': 801 continue 802 else: # parse header data (stuff above the hits table) 803 columns = line.strip().split(None, 1) 804 if len(columns) == 2: 805 806 identifier, data = columns 807 if identifier in ('Query', 'Command'): 808 data = data.strip() 809 elif identifier == 'Neff': 810 data = float(data) 811 elif identifier in ('Searched_HMMs', 'Match_columns'): 812 data = int(data) 813 814 header[identifier] = data 815 816 if identifier == 'Match_columns': 817 qlen = data 818 819 if in_hits and not header_only: 820 if not line.strip(): # suboptimal way to handle block switch 821 in_hits = False 822 in_alis = True 823 if self.alignments: 824 continue 825 else: 826 break 827 elif line.strip() == 'Done': 828 in_hits = False 829 in_alis = False 830 break 831 832 description = line[:34].split() 833 rank = int(description[0]) 834 id = description[1] 835 836 pos = line[85:94].strip() 837 start, end = map(int, pos.split('-')) 838 839 qpos = line[75:84].strip() 840 qstart, qend = map(int, qpos.split('-')) 841 842 probability = float(line[35:40]) / 100.0 843 844 hit = HHpredHit(rank, id, start, end, qstart, qend, probability, qlen) 845 846 hit.evalue = float(line[41:48]) 847 hit.pvalue = float(line[49:56]) 848 hit.score = float(line[57:63]) 849 hit.ss_score = float(line[64:69]) 850 851 hit.slength = int(line[94:].replace('(', '').replace(')', '')) 852 853 hits[hit.rank] = hit 854 alis[hit.rank] = {'q': [], 's': []} 855 856 elif in_alis and not header_only: 857 if line.startswith('Done'): 858 in_alis = False 859 break 860 861 elif line.startswith('No '): 862 c_rank = int(line[3:]) 863 if c_rank not in hits: 864 raise HHOutputFormatError('Alignment {0}. refers to a non-existing hit'.format(c_rank)) 865 866 elif line.startswith('>'): 867 hits[c_rank].name = line[1:].strip() 868 869 elif line.startswith('Probab='): 870 for pair in line.split(): 871 key, value = pair.split('=') 872 if key == 'Identities': 873 hits[c_rank].identity = float( 874 value.replace('%', '')) 875 elif key == 'Similarity': 876 hits[c_rank].similarity = float(value) 877 elif key == 'Sum_probs': 878 hits[c_rank].prob_sum = float(value) 879 880 elif line.startswith('Q ') and not line[:11].rstrip() in ('Q Consensus', 'Q ss_pred','Q ss_conf', 'Q ss_dssp'): 881 for residue in line[22:]: 882 if residue.isspace() or residue.isdigit(): 883 break 884 else: 885 alis[c_rank]['q'].append(residue) 886 has_alis = True 887 888 elif line.startswith('T ') and not line[:11].rstrip() in ('T Consensus', 'T ss_pred','T ss_conf', 'T ss_dssp'): 889 for residue in line[22:]: 890 if residue.isspace() or residue.isdigit(): 891 break 892 else: 893 alis[c_rank]['s'].append(residue) 894 895 if self.alignments and has_alis: 896 for rank in alis: 897 try: 898 hits[rank].add_alignment(alis[rank]['q'], alis[rank]['s']) 899 900 except (KeyError, ValueError) as er: 901 raise HHOutputFormatError('Corrupt alignment at hit No {0}.\n {1}'.format(rank, er)) 902 903 del alis 904 905 hits = HHpredHitList(hits.values()) 906 907 hits.sort() 908 909 ## add data obtained from the header to the HHpredHitList 910 for identifier, data in header.items(): 911 if identifier == 'Query': 912 hits.query_name = data 913 elif identifier == 'Match_columns': 914 hits.match_columns = data 915 elif identifier == 'No_of_seqs': 916 hits.no_of_seqs = data 917 elif identifier == 'Neff': 918 hits.neff = data 919 elif identifier == 'Searched_HMMs': 920 hits.searched_hmms = data 921 elif identifier == 'Date': 922 hits.date = data 923 elif identifier == 'Command': 924 hits.command = data 925 926 return hits
927 928 HHpredOutputParser = HHOutputParser 929