Source Code for Package csb.bio.fragments

   1  """ 
   2  APIs for working with protein structure fragments and libraries. 
   3   
   4  This package contains the nuts and bolts of HHfrag. Everything here revolves 
   5  around the L{Target} class, which describes a protein structure prediction 
   6  target. One typically assigns fragments (L{Assignment}s) to the target and then 
   7  builds a fragment library with L{RosettaFragsetFactory}. 
   8   
   9  @note: Internal or legacy objects are intentionally left undocumented. 
  10         This typically indicates experimental code. 
  11  """ 
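A minimal usage sketch of the workflow described above (not part of the package source; the accession codes, file name and residue ranks below are made up for illustration, and error handling is omitted):

    from csb.bio.io.wwpdb import StructureParser
    from csb.bio.fragments import Target, Assignment, RosettaFragsetFactory

    # target sequence in "accnC" format: accession '1abc', chain 'A' (hypothetical)
    target = Target.from_sequence('1abcA', 'MKVLITGAGSGIGLA')

    # fragment source: any chain with torsion angles precomputed
    source = StructureParser('2xyz.pdb').parse_structure().chains['A']
    source.compute_torsion()

    # match source residues 10..18 onto target residues 3..11
    target.assign(Assignment(source, start=10, end=18, qstart=3, qend=11,
                             probability=0.9, rmsd=1.2))

    # build a Rosetta-style fragment library from the assignments
    fragset = RosettaFragsetFactory().make_fragset(target)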
  12   
  13  import os 
  14  import numpy 
  15   
  16  import csb.io 
  17  import csb.core 
  18  import csb.bio.utils 
  19  import csb.bio.structure 
  20  import csb.bio.sequence 
  21   
  22  from csb.bio.structure import SecondaryStructure 
  23  
  24  
  25  class FragmentTypes(object):
  26      
  27      ISites = 'IS'
  28      HMMFragments = 'HH'
  29      HHThread = 'TH'
  30      HHfrag = HHThread
  31      Rosetta = 'NN'
  32  
  33  class Metrics(object):
  34      
  35      RMSD = 'rmsd_to'
  36      NORMALIZED_RMSD = 'nrmsd_to'
  37      MDA = 'mda_to'
  38  
  39  RANDOM_RMSD = { 5: 1.8749005857255376,  6: 2.4314283686276261,  7: 2.9021135267789608,  8: 3.2477716200172715,  9: 3.5469606556031708, 10: 3.8295465524456329,
  40                 11: 4.1343107114131783, 12: 4.3761697929053014, 13: 4.6707299668248394, 14: 4.9379016881069733, 15: 5.1809028645084911, 16: 5.4146957142595662,
  41                 17: 5.7135948448156988, 18: 5.9597935432566782, 19: 6.1337340535741962, 20: 6.3962825155503271, 21: 6.6107937773415166, 22: 6.8099096274123401,
  42                 23: 7.0435583846849639, 24: 7.2160956482560970, 25: 7.4547896324594962, 26: 7.6431870072434211, 27: 7.8727812194173836, 28: 8.0727393298443637,
  43                 29: 8.2551450998965326, 30: 8.4413583511786587, 31: 8.5958719774122052, 32: 8.7730435506242408, 33: 8.9970648837941649, 34: 9.1566521405105163,
  44                 35: 9.2828620878454728, 36: 9.4525824357923405, 37: 9.6322126445253300, 38: 9.7851684750961176, 39: 9.9891454649821476, 40: 10.124373939352028,
  45                 41: 10.284348528344765, 42: 10.390457305096271, 43: 10.565792044674239, 44: 10.676532740033737, 45: 10.789537132283652, 46: 11.004475543757550,
  46                 47: 11.064541647783571, 48: 11.231219875286985, 49: 11.319222637391441, 50: 11.485478165340824, 51: 11.607522494435521, 52: 11.700268836069840,
  47                 53: 11.831245255954073, 54: 11.918975893263905 }
  48  
  49  class FragmentMatch(object):
  50      """
  51      Base class, representing a match between a fragment and its target.
  52      """
  53      
  54      def __init__(self, id, qstart, qend, probability, rmsd, tm_score, qlength):
  55          
  56          self._id = id
  57          self._qstart = qstart
  58          self._qend = qend
  59          self._probability = probability
  60          self._rmsd = rmsd
  61          self._tm_score = tm_score
  62          self._qlength = qlength
63 64 @property
65 - def id(self):
66 return self._id
67 68 @property
69 - def qstart(self):
70 return self._qstart
71 72 @property
73 - def qend(self):
74 return self._qend
75 76 @property
77 - def qlength(self):
78 return self._qlength
79 80 @property
81 - def rmsd(self):
82 return self._rmsd
83 84 @property
85 - def tm_score(self):
86 return self._tm_score
87 88 @property
89 - def probability(self):
90 return self._probability
91 92 @property
93 - def length(self):
94 return self.qend - self.qstart + 1
95 96 @property
97 - def source_id(self):
98 raise NotImplementedError()
99 100 @property
101 - def start(self):
102 raise NotImplementedError()
103 104 @property
105 - def end(self):
106 raise NotImplementedError()
107
108 -class PredictionContainer(object):
109
110 - def __init__(self, target, isites_prediction, hmm_prediction, combined_prediction):
111 112 self.target = target 113 114 self.isites = isites_prediction 115 self.hmm = hmm_prediction 116 self.combined = combined_prediction
117
118 -class Prediction(object):
119
120 - def __init__(self, alignment, coordinates):
121 122 self.alignment = alignment 123 self.coordinates = coordinates
124
 125  class TorsionAnglesPredictor(object):
 126      """
 127      Fragment-based phi/psi angles predictor.
 128      
 129      @param target: target protein, containing fragment assignments
 130      @type target: L{Target}
 131      @param threshold: RMSD distance threshold for L{FragmentCluster}-based filtering
 132      @type threshold: float
 133      @param extend: pick alternative, longer cluster reps, if possible
 134      @type extend: bool
 135      @param init: populate all L{FragmentCluster}s on instantiation. If False, this step
 136                   will be performed on demand (the first time C{predictor.compute()} is invoked)
 137      
 138      @note: if C{init} is False, the first call to C{predictor.compute()} might take a long
 139             time. Subsequent calls will be very fast.
 140      """
 141      
 142      def __init__(self, target, threshold=1.5, extend=False, init=False):
 143          
 144          if not isinstance(target, Target):
 145              raise TypeError(target)
 146          if target.matches.length == 0:
 147              raise ValueError('This target has no fragment assignments')
 148          
 149          self._target = target
 150          self._threshold = float(threshold)
 151          self._extend = bool(extend)
 152          
 153          self._initialized = False
 154          self._reps = {}
 155          self._clusters = {}
 156          
 157          if init:
 158              self.init()
159 160 @property
161 - def target(self):
162 return self._target
163 164 @property
165 - def threshold(self):
166 return self._threshold
167 168 @property
169 - def extend(self):
170 return self._extend
171
172 - def init(self):
173 """ 174 Compute and cache all L{FragmentCluster}s. 175 """ 176 177 self._reps = {} 178 self._clusters = {} 179 180 for residue in self.target.residues: 181 cluster = self._filter(residue) 182 183 if cluster is not None: 184 rep = cluster.centroid() 185 if rep.has_alternative: 186 rep.exchange() 187 188 self._reps[residue.native.rank] = rep 189 self._clusters[residue.native.rank] = cluster.items 190 191 self._initialized = True
192
193 - def _filter(self, residue):
194 195 try: 196 nodes = [] 197 for ai in residue.assignments: 198 node = ClusterNode.create(ai.fragment) 199 nodes.append(node) 200 201 cluster = FragmentCluster(nodes, threshold=self.threshold) 202 cluster.shrink(minitems=0) 203 204 return cluster 205 206 except (ClusterExhaustedError, ClusterDivergingError): 207 return None
208
209 - def _residue(self, rank):
210 211 for r in self._target.residues: 212 if r.native.rank == rank: 213 return r 214 215 raise ValueError('Rank {0} is out of range'.format(rank))
216
217 - def compute_single(self, rank):
218 """ 219 Extract torsion angles from the L{ClusterRep} at residue C{#rank}. 220 221 @param rank: target residue rank 222 @type rank: int 223 224 @rtype: L{TorsionPredictionInfo} 225 """ 226 227 residue = self._residue(rank) 228 rep = residue.filter(threshold=self.threshold, extend=self.extend) 229 230 if rep is None: 231 return None 232 233 else: 234 fragment = rep.centroid 235 torsion = fragment.torsion_at(rank, rank)[0] 236 ss = fragment.sec_structure_at(rank, rank)[0] 237 238 return TorsionPredictionInfo(rank, rep.confidence, torsion, ss, primary=True)
239
240 - def compute(self, rank):
241 """ 242 Extract torsion angles from all L{ClusterRep}s, covering residue C{#rank}. 243 244 @param rank: target residue rank 245 @type rank: int 246 247 @return: L{TorsionPredictionInfo} instances, sorted by confidence 248 @rtype: tuple of L{TorsionPredictionInfo} 249 """ 250 251 if not self._initialized: 252 self.init() 253 254 prediction = [] 255 256 for rep in self._reps.values(): 257 258 if rep.centroid.qstart <= rank <= rep.centroid.qend: 259 260 fragment = rep.centroid 261 torsion = fragment.torsion_at(rank, rank)[0] 262 ss = fragment.sec_structure_at(rank, rank)[0] 263 info = TorsionPredictionInfo(rank, rep.confidence, torsion, ss) 264 265 if rep is self._reps.get(rank, None): 266 info.primary = True 267 268 prediction.append(info) 269 270 prediction.sort(reverse=True) 271 return tuple(prediction)
272
 273      def flat_torsion_map(self):
 274          """
 275          Filter the current fragment map and create a new, completely flat,
 276          non-overlapping map built from centroids, assigned iteratively by
 277          decreasing confidence. Centroids with lower confidence which overlap
 278          with previously assigned centroids will be trimmed to fill existing
 279          gaps only.
 280          
 281          @return: L{TorsionPredictionInfo} instances, one for each target residue
 282          @rtype: tuple of L{TorsionPredictionInfo}
 283          """
 284          
 285          if not self._initialized:
 286              self.init()
 287          
 288          prediction = []
 289          slots = set(range(1, self.target.length + 1))
 290          
 291          reps = list(self._reps.values())
 292          reps.sort(key=lambda i: i.confidence, reverse=True)
 293          
 294          for rep in reps:
 295              
 296              for rank in range(rep.centroid.qstart, rep.centroid.qend + 1):
 297                  if rank in slots:
 298                      torsion = rep.centroid.torsion_at(rank, rank)[0]
 299                      ss = rep.centroid.sec_structure_at(rank, rank)[0]
 300                      info = TorsionPredictionInfo(rank, rep.confidence, torsion, ss, primary=True)
 301                      
 302                      prediction.append(info)
 303                      slots.remove(rank)
 304          
 305          for rank in slots:  # residues not covered by any cluster rep: no angles, no secondary structure
 306              prediction.append(TorsionPredictionInfo(rank, 0, None, None))
 307          
 308          prediction.sort(key=lambda i: i.rank)
 309          return tuple(prediction)
310
311 - def get_angles(self, rank):
312 """ 313 Extract all torsion angles coming from all fragments, which had survived 314 the filtering and cover residue C{#rank}. 315 316 @param rank: target residue rank 317 @type rank: int 318 319 @return: all L{TorsionAngles} for a cluster at the specified residue 320 @rtype: tuple of L{TorsionAngles} 321 """ 322 323 if not self._initialized: 324 self.init() 325 if rank not in self._clusters: 326 return tuple() 327 328 angles = [] 329 330 for node in self._clusters[rank]: 331 fragment = node.fragment 332 torsion = fragment.torsion_at(rank, rank)[0] 333 angles.append(torsion) 334 335 return tuple(angles)
336
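A short usage sketch for the predictor above (assumes `target` is a L{Target} that already carries fragment assignments; the residue rank is arbitrary):

    predictor = TorsionAnglesPredictor(target, threshold=1.5, extend=False)

    # all cluster representatives covering residue 25, most confident first
    for info in predictor.compute(rank=25):
        print(info.rank, info.confidence, info.torsion.phi, info.torsion.psi)

    # only the representative anchored at residue 25 (may be None)
    primary = predictor.compute_single(25)
    if primary is not None:
        print(primary.as_tuple())    # (confidence, phi, psi, omega)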
337 338 -class TorsionPredictionInfo(object):
339 """ 340 Struct container for a single torsion angle prediction. 341 342 @param rank: target residue rank 343 @type rank: int 344 @param confidence: confidence of prediction 345 @type confidence: float 346 @param torsion: assigned phi/psi/omega angles 347 @type torsion: L{TorsionAngles} 348 @param dssp: assigned secondary structure 349 @type dssp: L{SecondaryStructureElement} 350 @param primary: if True, designates that the assigned angles are extracted 351 from the L{ClusterRep} at residue C{#rank}; otherwise: the 352 angles are coming from another, overlapping L{ClusterRep} 353 354 """ 355
356 - def __init__(self, rank, confidence, torsion, dssp, primary=False):
357 358 self.rank = rank 359 self.confidence = confidence 360 self.torsion = torsion 361 self.primary = primary 362 self.dssp = dssp
363
364 - def as_tuple(self):
365 """ 366 @return: convert this prediction to a tuple: (confidence, phi, psi, omega) 367 @rtype: tuple 368 """ 369 return tuple([self.confidence, self.torsion.phi, self.torsion.psi, self.torsion.omega])
370
371 - def __str__(self):
372 return '<TorsionPredictionInfo: {0.confidence:6.3f} at #{0.rank}>'.format(self)
373
374 - def __lt__(self, other):
375 return self.confidence < other.confidence
376
377 378 -class AssignmentFactory(object):
379
380 - def target(self, *a, **k):
381 return Target(*a, **k)
382
383 - def residue(self, *a, **k):
384 return TargetResidue(*a, **k)
385
386 - def assignment(self, *a, **k):
387 return Assignment(*a, **k)
388
389 -class ChemShiftAssignmentFactory(object):
390
391 - def target(self, *a, **k):
392 return ChemShiftTarget(*a, **k)
393
394 - def residue(self, *a, **k):
395 return ChemShiftTargetResidue(*a, **k)
396
397 - def assignment(self, *a, **k):
398 return ChemShiftAssignment(*a, **k)
399
 400  class Target(csb.core.AbstractNIContainer):
 401      """
 402      Represents a protein structure prediction target.
 403      
 404      @param id: target sequence ID, in PDB accnC format
 405      @type id: str
 406      @param length: total target sequence length
 407      @type length: int
 408      @param residues: a list, containing target's residues. See also
 409                       L{Target.from_sequence}
 410      @type residues: iterable of L{csb.bio.structure.ProteinResidue}s
 411      """
 412      
 413      def __init__(self, id, length, residues, overlap=None, segments=None, factory=AssignmentFactory()):
 414          
 415          self._id = id
 416          self._accession = id[:-1]
 417          self._chain_id = id[-1]
 418          self._length = length
 419          self._overlap = overlap
 420          self._factory = factory
 421          
 422          self._assignments = csb.core.ReadOnlyCollectionContainer(type=Assignment)
 423          self._errors = csb.core.CollectionContainer()
 424          
 425          resi = [factory.residue(native) for native in residues]
 426          self._residues = csb.core.ReadOnlyCollectionContainer(items=resi,
 427                                                                type=TargetResidue, start_index=1)
 428          
 429          if segments is not None:
 430              segments = dict([(s.start, s) for s in segments])
 431          self._segments = csb.core.ReadOnlyDictionaryContainer(items=segments)
432 433 @staticmethod
434 - def from_sequence(id, sequence):
435 """ 436 Factory, which builds L{Target} objects from a bare sequence. 437 438 @param sequence: target's sequence 439 @type sequence: L{csb.bio.sequence.AbstractSequence}, str or iterable 440 441 @rtype: L{Target} 442 """ 443 444 if isinstance(sequence, csb.bio.sequence.Sequence): 445 sequence = sequence.sequence 446 447 residues = [] 448 449 for rn, aa in enumerate(sequence, start=1): 450 residue = csb.bio.structure.ProteinResidue(rank=rn, type=aa) 451 residues.append(residue) 452 453 return Target(id, len(residues), residues)
454 455 @staticmethod
456 - def from_profile(hmm):
457 """ 458 Factory, which builds L{Target} objects from an HMM profile. 459 460 @param hmm: target's HMM 461 @type hmm: L{csb.bio.hmm.ProfileHMM} 462 463 @rtype: L{Target} 464 """ 465 466 residues = [ r.clone() for r in hmm.residues ] 467 return Target(hmm.id, hmm.layers.length, residues)
468 469 @staticmethod
470 - def deserialize(pickle):
471 472 with open(pickle) as stream: 473 return csb.io.Pickle.load(stream)
474 475 @property
476 - def _children(self):
477 return self._residues
478 479 @property
480 - def errors(self):
481 return self._errors
482 483 @property
484 - def id(self):
485 return self._id
486 487 @property
488 - def accession(self):
489 return self._accession
490 491 @property
492 - def chain_id(self):
493 return self._chain_id
494 495 @property
496 - def max_overlap(self):
497 return self._overlap
498 499 @property
500 - def length(self):
501 return self._length
502 503 @property
504 - def sequence(self):
505 return ''.join(str(r.native.type) for r in self)
506 507 @property
508 - def matches(self):
509 return self._assignments
510 511 @property
512 - def residues(self):
513 return self._residues
514 515 @property
516 - def segments(self):
517 return self._segments
518
519 - def assign(self, fragment):
520 """ 521 Add a new fragment match. 522 @param fragment: fragment to assign 523 @type fragment: L{Assignment} 524 """ 525 526 if not 1 <= fragment.qstart <= fragment.qend <= len(self._residues): 527 raise ValueError("Fragment out of range") 528 529 self._assignments._append_item(fragment) 530 531 for rank in range(fragment.qstart, fragment.qend + 1): 532 ai = ResidueAssignmentInfo(fragment, rank) 533 self._residues[rank].assign(ai) 534 535 if fragment.segment is not None: 536 try: 537 self._segments[fragment.segment].assign(fragment) 538 except KeyError: 539 raise ValueError("Undefined segment starting at {0}".format(fragment.segment))
540
541 - def assignall(self, fragments):
542 """ 543 Assign a bunch of fragments at once. 544 @type fragments: iterable of L{Assignment}s 545 """ 546 for frag in fragments: 547 self.assign(frag)
548
549 - def filter(self, threshold=1.5, extend=False):
550 """ 551 Filter the current fragment map using a L{FragmentCluster}. 552 553 @param threshold: cluster RMSD threshold (see L{FragmentCluster}) 554 @type threshold: float 555 @param extend: pick extended alternatives where possible (default=False) 556 @type extend: bool 557 558 @return: a new target, containing only cluster centroids/reps 559 @rtype: L{Target} 560 """ 561 562 target = self.clone() 563 564 for residue in self.residues: 565 rep = residue.filter(threshold=threshold, extend=extend) 566 567 if rep is not None: 568 target.assign(rep.centroid) 569 570 return target
571
572 - def clone(self):
573 """ 574 @return: a deep copy of the target 575 @rtype: L{Target} 576 """ 577 578 segments = [self.segments[start] for start in self.segments] 579 segments = [TargetSegment(s.start, s.end, s.count) for s in segments] 580 581 target = self._factory.target(self.id, self.length, [r.native for r in self.residues], 582 overlap=self._overlap, segments=segments) 583 584 return target
585
586 -class ChemShiftTarget(Target):
587
 588      def __init__(self, id, length, residues, overlap=None):
 589          
 590          super(ChemShiftTarget, self).__init__(id, length, residues, overlap=overlap,
 591                                                factory=ChemShiftAssignmentFactory())
 592  
593 - def assign(self, fragment):
594 595 if not 1 <= fragment.qstart <= fragment.qend <= len(self._residues): 596 raise ValueError("Fragment out of range") 597 598 self._assignments._append_item(fragment) 599 600 rank = fragment.qstart 601 ai = ResidueAssignmentInfo(fragment, rank) 602 self._residues[rank].assign(ai)
603
604 - def clone(self):
605 return self._factory.target(self.id, self.length, [r.native for r in self.residues], 606 overlap=self._overlap)
607
608 -class TargetResidue(object):
609 """ 610 Wrapper around L{Target}'s native residues. Decorates them with additional, 611 fragment-related methods. 612 613 @type native_residue: L{csb.bio.structure.ProteinResidue} 614 """ 615
616 - def __init__(self, native_residue):
617 618 self._type = native_residue.type 619 self._native = native_residue.clone() 620 self._assignments = csb.core.ReadOnlyCollectionContainer(type=ResidueAssignmentInfo)
621 622 @property
623 - def type(self):
624 return self._type
625 626 @property
627 - def native(self):
628 return self._native
629 630 @property
631 - def assignments(self):
632 return self._assignments
633
634 - def assign(self, assignment_info):
635 self._assignments._append_item(assignment_info)
636
637 - def verybest(self):
638 """ 639 @return: the fragment with the lowest RMSD at this position in the L{Target} 640 @rtype: L{Assignment} 641 """ 642 643 best = None 644 645 for ai in self.assignments: 646 a = ai.fragment 647 if a.length < FragmentCluster.MIN_LENGTH: 648 continue 649 if best is None or a.rmsd < best.rmsd: 650 best = a 651 elif a.rmsd == best.rmsd and a.length > best.length: 652 best = a 653 654 return best
655
656 - def filter(self, method=Metrics.RMSD, threshold=1.5, extend=False):
657 """ 658 Filter all fragments, covering this position in the L{Target} using a 659 L{FragmentCluster}. 660 661 @param method: one of the L{Metrics} members (default=L{Metrics.RMSD}) 662 @type method: str 663 @param threshold: cluster RMSD threshold (see L{FragmentCluster}) 664 @type threshold: float 665 @param extend: pick extended alternative where possible (default=False) 666 @type extend: bool 667 668 @return: cluster's representative (if converged) or None 669 @rtype: L{ClusterRep} or None 670 """ 671 672 try: 673 nodes = [] 674 for ai in self.assignments: 675 node = ClusterNode.create(ai.fragment, method, extend) 676 nodes.append(node) 677 678 cluster = FragmentCluster(nodes, threshold=threshold) 679 680 center = cluster.shrink(minitems=0) 681 if center.has_alternative: 682 center.exchange() 683 684 return center 685 686 except (ClusterExhaustedError, ClusterDivergingError): 687 return None
688
689 - def longest(self):
690 """ 691 @return: the longest fragment, covering the current position 692 @rtype: L{Assignment} 693 """ 694 best = None 695 696 for q in self.assignments: 697 if best is None or (q.fragment.length > best.length): 698 best = q.fragment 699 700 return best
701
702 - def precision(self, threshold=1.5):
703 """ 704 @return: the residue-wise precision of the fragment library at the 705 current position (percentage). 706 707 @param threshold: true-positive RMSD cutoff (default=1.5) 708 @type threshold: float 709 @rtype: float 710 """ 711 712 if self.assignments.length < 1: 713 return None 714 else: 715 positive = [a for a in self.assignments if a.fragment.rmsd <= threshold] 716 pos = len(positive) * 100.0 / self.assignments.length 717 718 return pos
719
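Sketch of the per-residue API provided by L{Target} and L{TargetResidue} (assumes `target` already carries fragment assignments; the rank below is arbitrary):

    residue = target.residues[10]                 # TargetResidue at rank 10

    best = residue.verybest()                     # lowest-RMSD fragment covering this rank
    rep = residue.filter(threshold=1.5)           # ClusterRep, or None if clustering failed
    if rep is not None:
        print(rep.centroid.id, rep.confidence)

    print(residue.precision(threshold=1.5))       # % of fragments within 1.5 A of the native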
720 -class ChemShiftTargetResidue(TargetResidue):
721
722 - def verybest(self):
723 724 best = None 725 726 for ai in self.assignments: 727 a = ai.fragment 728 729 if a.score < ChemShiftAssignment.BIT_SCORE_THRESHOLD * a.window: 730 continue 731 732 if best is None or a.score > best.score: 733 best = a 734 elif a.score == best.score and a.length > best.length: 735 best = a 736 737 return best
738
739 -class TargetSegment(object):
740
741 - def __init__(self, start, end, count):
742 743 self._start = start 744 self._end = end 745 self._count = count 746 747 self._assignments = csb.core.ReadOnlyCollectionContainer(type=Assignment)
748 749 @property
750 - def count(self):
751 return self._count
752 753 @property
754 - def start(self):
755 return self._start
756 757 @property
758 - def end(self):
759 return self._end
760 761 @property
762 - def length(self):
763 return (self._end - self._start + 1)
764 765 @property
766 - def assignments(self):
767 return self._assignments
768
769 - def assign(self, fragment):
770 if fragment.segment != self.start: 771 raise ValueError('Segment origin mismatch: {0} vs {1}'.format(fragment.segment, self.start)) 772 else: 773 self._assignments._append_item(fragment)
774
775 - def verybest(self):
776 777 best = None 778 779 for a in self.assignments: 780 if a.length < FragmentCluster.MIN_LENGTH: 781 continue 782 if best is None or a.rmsd < best.rmsd: 783 best = a 784 elif a.rmsd == best.rmsd and a.length > best.length: 785 best = a 786 787 return best
788
789 - def best(self, method=Metrics.RMSD):
790 791 try: 792 cluster = FragmentCluster(self.assignments, threshold=1.5, 793 connectedness=0.5, method=method) 794 centroid = cluster.shrink(minitems=1) 795 return centroid 796 797 except ClusterExhaustedError: 798 return None 799 finally: 800 del cluster
801
802 - def longest(self):
803 804 best = None 805 806 for q in self.assignments: 807 if best is None or (q.length > best.length): 808 best = q 809 810 return best
811
812 - def pairwise_rmsd(self, min_overlap=5):
813 814 rmsds = [] 815 816 for q in self.assignments: 817 for s in self.assignments: 818 if q is not s: 819 r = q.rmsd_to(s, min_overlap) 820 if r is not None: 821 rmsds.append(r) 822 else: 823 assert q.rmsd_to(s, 1) < 0.01 824 825 return rmsds
826
827 - def pairwise_mda(self, min_overlap=5):
828 829 mdas = [] 830 831 for q in self.assignments: 832 for s in self.assignments: 833 if q is not s: 834 m = q.mda_to(s, min_overlap) 835 if m is not None: 836 mdas.append(m) 837 return mdas
838
839 - def pairwise_sa_rmsd(self, profiles='.', min_overlap=5):
840 841 from csb.bio.hmm import RELATIVE_SA 842 from csb.bio.io.hhpred import ScoreUnits, HHProfileParser 843 844 def convert_sa(sa): 845 return numpy.array([ RELATIVE_SA[i] for i in sa ])
846 847 sources = {} 848 scores = [] 849 850 for q in self.assignments: 851 for s in self.assignments: 852 853 if s.source_id not in sources: 854 hmm = HHProfileParser(os.path.join(profiles, s.source_id + '.hhm')).parse() 855 sources[s.source_id] = hmm.dssp_solvent 856 857 if q is not s: 858 859 common = q.overlap(s) 860 if len(common) >= min_overlap: 861 862 qsa = q.solvent_at(sources[q.source_id], min(common), max(common)) 863 ssa = s.solvent_at(sources[s.source_id], min(common), max(common)) 864 865 if '-' in qsa + ssa: 866 continue 867 868 qsa = convert_sa(qsa) 869 ssa = convert_sa(ssa) 870 assert len(qsa) == len(ssa) 871 sa_rmsd = numpy.sqrt(numpy.sum((qsa - ssa) ** 2) / float(len(qsa))) 872 873 scores.append(sa_rmsd) 874 return scores
875
876 - def pairwise_scores(self, profiles='.', min_overlap=5):
877 878 from csb.bio.hmm import BACKGROUND 879 back = numpy.sqrt(numpy.array(BACKGROUND)) 880 881 sources = {} 882 scores = [] 883 884 for q in self.assignments: 885 for s in self.assignments: 886 887 if s.source_id not in sources: 888 # hmm = HHProfileParser(os.path.join(hmm_path, s.source_id + '.hhm')).parse(ScoreUnits.Probability) 889 sources[s.source_id] = csb.io.Pickle.load(open(os.path.join(profiles, s.source_id + '.pkl'), 'rb')) 890 891 if q is not s: 892 893 common = q.overlap(s) 894 if len(common) >= min_overlap: 895 896 qprof = q.profile_at(sources[q.source_id], min(common), max(common)) 897 sprof = s.profile_at(sources[s.source_id], min(common), max(common)) 898 899 #score = qhmm.emission_similarity(shmm) 900 assert len(qprof) == len(sprof) 901 dots = [ numpy.dot(qprof[i] / back, sprof[i] / back) for i in range(len(qprof)) ] 902 score = numpy.log(numpy.prod(dots)) 903 if score is not None: 904 scores.append(score) 905 return scores
906
 907      def _entropy(self, data, binsize):
 908          
 909          binsize = float(binsize)
 910          bins = numpy.ceil(numpy.array(data) / binsize)
 911          
 912          hist = dict.fromkeys(bins, 0)
 913          for bin in bins:
 914              hist[bin] += (1.0 / len(bins))
 915          
 916          freq = numpy.array(list(hist.values()))   # list() needed on Python 3, where dict.values() is a view
 917          return - numpy.sum(freq * numpy.log(freq))
918
919 - def rmsd_entropy(self, binsize=0.1):
920 921 rmsds = self.pairwise_rmsd() 922 return self._entropy(rmsds, binsize)
923
924 - def score_entropy(self, profiles='.', binsize=1):
925 926 scores = self.pairwise_scores(profiles) 927 return self._entropy(scores, binsize)
928
929 - def rmsd_consistency(self, threshold=1.5):
930 931 rmsds = self.pairwise_rmsd() 932 933 if len(rmsds) < 1: 934 return None 935 936 return sum([1 for i in rmsds if i <= threshold]) / float(len(rmsds))
937
938 - def sa_rmsd_consistency(self, threshold=0.4, profiles='.'):
939 940 sa_rmsds = self.pairwise_sa_rmsd(profiles=profiles) 941 942 if len(sa_rmsds) < 1: 943 return None 944 945 return sum([1 for i in sa_rmsds if i <= threshold]) / float(len(sa_rmsds))
946
947 - def true_positives(self, threshold=1.5):
948 949 if self.assignments.length < 1: 950 return None 951 952 return sum([1 for i in self.assignments if i.rmsd <= threshold]) / float(self.assignments.length)
953
954 - def confidence(self):
955 956 cons = self.rmsd_consistency() 957 958 if cons is None: 959 return 0 960 else: 961 return numpy.log10(self.count) * cons
962
963 -class ResidueAssignmentInfo(object):
964
 965      def __init__(self, assignment, rank):
 966          
 967          if not assignment.qstart <= rank <= assignment.qend:
 968              raise ValueError('Rank {0} is not matched by this assignment'.format(rank))
 969          
 970          self._assignment = assignment
 971          self._rank = rank
 972          self._relrank = rank - assignment.qstart
973 974 @property
975 - def c_alpha(self):
976 return self._assignment.backbone[self._relrank]
977 978 @property
979 - def fragment(self):
980 return self._assignment
981
982 -class Assignment(FragmentMatch):
983 """ 984 Represents a match between a fragment and its target. 985 986 @param source: source structure (must have torsion angles precomputed) 987 @type source: L{csb.bio.structure.Chain} 988 @param start: start position in C{source} (rank) 989 @type start: int 990 @param end: end position in C{source} (rank) 991 @type end: int 992 @param id: fragment ID 993 @type id: str 994 @param qstart: start position in target (rank) 995 @type qstart: int 996 @param qend: end position in target (rank) 997 @type qend: int 998 @param probability: probability of assignment 999 @type probability: float 1000 @param rmsd: RMSD of the fragment, compared to target's native structure 1001 @type rmsd: float 1002 """ 1003
1004 - def __init__(self, source, start, end, qstart, qend, id=None, probability=None, rmsd=None, 1005 tm_score=None, score=None, neff=None, segment=None, internal_id=None):
1006 1007 assert source.has_torsion 1008 sub = source.subregion(start, end, clone=True) 1009 try: 1010 calpha = [r.atoms['CA'].vector.copy() for r in sub.residues] 1011 except csb.core.ItemNotFoundError: 1012 raise csb.bio.structure.Broken3DStructureError() 1013 torsion = [r.torsion.copy() for r in sub.residues] 1014 1015 self._calpha = csb.core.ReadOnlyCollectionContainer(items=calpha, type=numpy.ndarray) 1016 self._torsion = torsion 1017 self._sequence = sub.sequence 1018 1019 self._source_id = source.accession[:4] + source.id 1020 self._start = start 1021 self._end = end 1022 1023 self._score = score 1024 self._neff = neff 1025 self._ss = None 1026 1027 self._segment_start = segment 1028 self.internal_id = internal_id 1029 1030 if id is None: 1031 id = "{0}:{1}-{2}".format(self.source_id, self.start, self.end) 1032 1033 super(Assignment, self).__init__(id, qstart, qend, probability, rmsd, tm_score, None) 1034 1035 self._ss = SecondaryStructure('-' * self.length)
1036 1037 @staticmethod
1038 - def from_fragment(fragment, provider):
1039 """ 1040 Create a new L{Assignment} given a source rosetta fragment. 1041 1042 @param fragment: rosetta fragment 1043 @type fragment: L{RosettaFragment} 1044 @param provider: PDB database provider 1045 @type provider: L{StructureProvider} 1046 1047 @rtype: L{Assignment} 1048 """ 1049 try: 1050 structure = provider.get(fragment.accession) 1051 except KeyError: 1052 structure = provider.get(fragment.source_id) 1053 source = structure.chains[fragment.chain] 1054 source.compute_torsion() 1055 1056 id = "{0}:{1}-{2}".format(fragment.source_id, fragment.start, fragment.end) 1057 1058 return Assignment(source, fragment.start, fragment.end, 1059 fragment.qstart, fragment.qend, id, 0, 0)
1060 1061 @property
1062 - def backbone(self):
1063 return self._calpha
1064 1065 @property
1066 - def sequence(self):
1067 return self._sequence
1068 1069 @property
1070 - def torsion(self):
1071 return self._torsion
1072 1073 @property
1074 - def source_id(self):
1075 return self._source_id
1076 1077 @property
1078 - def start(self):
1079 return self._start
1080 1081 @property
1082 - def end(self):
1083 return self._end
1084 1085 @property
1086 - def score(self):
1087 return self._score
1088 1089 @property
1090 - def neff(self):
1091 return self._neff
1092 1093 @property
1094 - def segment(self):
1095 return self._segment_start
1096 1097 @property
1098 - def secondary_structure(self):
1099 return self._ss
1100 @secondary_structure.setter
1101 - def secondary_structure(self, value):
1102 1103 if isinstance(value, csb.core.string): 1104 value = csb.bio.structure.SecondaryStructure(value) 1105 if len(str(value)) != self.length:#(value.end - value.start + 1) != self.length: 1106 raise ValueError("Invalid secondary structure length", len(str(value)), self.length ) 1107 1108 self._ss = value
1109
1110 - def transform(self, rotation, translation):
1111 """ 1112 Apply rotation/translation to fragment's coordinates in place. 1113 """ 1114 1115 for ca in self.backbone: 1116 newca = numpy.dot(ca, numpy.transpose(rotation)) + translation 1117 for i in range(3): 1118 ca[i] = newca[i]
1119
1120 - def _check_range(self, qstart, qend):
1121 1122 if not (self.qstart <= qstart <= qend <= self.qend): 1123 raise ValueError('Region {0}..{1} is out of range {2.qstart}..{2.qend}'.format(qstart, qend, self))
1124
1125 - def anchored_around(self, rank):
1126 """ 1127 @return: True if the fragment is centered around position=C{rank}. 1128 @rtype: bool 1129 """ 1130 1131 if self.qstart < rank < self.qend: 1132 if (rank - self.qstart + 1) > 0.4 * (self.qend - self.qstart + 1): 1133 return True 1134 1135 return False
1136
1137 - def backbone_at(self, qstart, qend):
1138 """ 1139 @return: the CA coordinates of the fragment at the specified subregion. 1140 @rtype: list 1141 """ 1142 1143 self._check_range(qstart, qend) 1144 1145 relstart = qstart - self.qstart 1146 relend = qend - self.qstart + 1 1147 1148 return self.backbone[relstart : relend]
1149
1150 - def torsion_at(self, qstart, qend):
1151 """ 1152 @return: the torsion angles of the fragment at the specified subregion. 1153 @rtype: list 1154 """ 1155 1156 self._check_range(qstart, qend) 1157 1158 relstart = qstart - self.qstart 1159 relend = qend - self.qstart + 1 1160 1161 return self.torsion[relstart : relend]
1162
1163 - def solvent_at(self, sa_string, qstart, qend):
1164 1165 self._check_range(qstart, qend) 1166 1167 relstart = qstart - self.qstart 1168 relend = qend - self.qstart + 1 1169 1170 return sa_string[relstart : relend]
1171
1172 - def sec_structure_at(self, qstart, qend):
1173 1174 self._check_range(qstart, qend) 1175 start = qstart - self.qstart + 1 1176 end = qend - self.qstart + 1 1177 1178 return self.secondary_structure.scan(start, end, loose=True, cut=True)
1179
1180 - def profile_at(self, source, qstart, qend):
1181 1182 self._check_range(qstart, qend) 1183 1184 start = qstart - self.qstart + self.start 1185 end = qend - self.qstart + self.start 1186 1187 if hasattr(source, 'subregion'): 1188 return source.subregion(start, end) 1189 else: 1190 return source[start - 1 : end]
1191
1192 - def chain_at(self, source, qstart, qend):
1193 1194 self._check_range(qstart, qend) 1195 1196 start = qstart - self.qstart + self.start 1197 end = qend - self.qstart + self.start 1198 1199 return source.subregion(start, end)
1200
1201 - def overlap(self, other):
1202 """ 1203 @type other: L{Assignment} 1204 @return: target positions, covered by both C{self} and C{other} 1205 @rtype: set of int 1206 """ 1207 1208 qranks = set(range(self.qstart, self.qend + 1)) 1209 sranks = set(range(other.qstart, other.qend + 1)) 1210 1211 return qranks.intersection(sranks)
1212
1213 - def rmsd_to(self, other, min_overlap=5):
1214 """ 1215 @return: the CA RMSD between C{self} and C{other}. 1216 1217 @param other: another fragment 1218 @type other: L{Assignment} 1219 @param min_overlap: require at least that number of overlapping residues 1220 (return None if not satisfied) 1221 @type min_overlap: int 1222 1223 @rtype: float 1224 """ 1225 1226 common = self.overlap(other) 1227 1228 if len(common) >= min_overlap: 1229 1230 qstart, qend = min(common), max(common) 1231 1232 q = self.backbone_at(qstart, qend) 1233 s = other.backbone_at(qstart, qend) 1234 1235 if len(q) > 0 and len(s) > 0: 1236 return csb.bio.utils.rmsd(numpy.array(q), numpy.array(s)) 1237 1238 return None
1239
1240 - def nrmsd_to(self, other, min_overlap=5):
1241 1242 common = self.overlap(other) 1243 1244 if len(common) >= min_overlap: 1245 1246 qstart, qend = min(common), max(common) 1247 1248 q = self.backbone_at(qstart, qend) 1249 s = other.backbone_at(qstart, qend) 1250 1251 if len(q) > 0 and len(s) > 0: 1252 return csb.bio.utils.rmsd(q, s) / RANDOM_RMSD[ len(common) ] 1253 1254 return None
1255
1256      def mda_to(self, other, min_overlap=5):
1257          
1258          common = self.overlap(other)
1259          
1260          if len(common) >= min_overlap:
1261              
1262              qstart, qend = min(common), max(common)
1263              
1264              q = self.torsion_at(qstart, qend)
1265              s = other.torsion_at(qstart, qend)
1266              
1267              if len(q) > 0 and len(s) > 0:
1268                  
1269                  maxphi = max(numpy.abs(i.phi - j.phi) for i, j in list(zip(q, s))[1:])   # phi: 2 .. L
1270                  maxpsi = max(numpy.abs(i.psi - j.psi) for i, j in list(zip(q, s))[:-1])  # psi: 1 .. L-1
1271                  
1272                  return max(maxphi, maxpsi)
1273          
1274          return None
1275
1276 - def to_rosetta(self, source, qstart=None, qend=None, weight=None):
1277 """ 1278 @deprecated: this method will be deleted soon. Use 1279 L{csb.bio.fragments.rosetta.OutputBuilder} instead. 1280 """ 1281 stream = csb.io.MemoryStream() 1282 1283 if weight is None: 1284 weight = self.probability 1285 if not qstart: 1286 qstart = self.qstart 1287 if not qend: 1288 qend = self.qend 1289 1290 source.compute_torsion() 1291 chain = self.chain_at(source, qstart, qend) 1292 1293 for i, r in enumerate(chain.residues): 1294 1295 acc = self.source_id[:4] 1296 ch = self.source_id[4].upper() 1297 1298 start = qstart - self.qstart + self.start + i 1299 aa = r.type 1300 ss = 'L' 1301 phi, psi, omega = 0, 0, 0 1302 if r.torsion.phi: 1303 phi = r.torsion.phi 1304 if r.torsion.psi: 1305 psi = r.torsion.psi 1306 if r.torsion.omega: 1307 omega = r.torsion.omega 1308 1309 stream.write(' {0:4} {1:1} {2:>5} {3!s:1} {4!s:1} {5:>8.3f} {6:>8.3f} {7:>8.3f} {8:>8.3f}\n'.format(acc, ch, start, aa, ss, phi, psi, omega, weight)) 1310 1311 return stream.getvalue()
1312
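Sketch: comparing two overlapping fragments with the metrics defined above (`frag_a` and `frag_b` stand for two L{Assignment} instances covering a common target region):

    common = frag_a.overlap(frag_b)                     # shared target ranks (set of int)
    if len(common) >= 5:
        print(frag_a.rmsd_to(frag_b, min_overlap=5))    # CA RMSD over the shared region, or None
        print(frag_a.nrmsd_to(frag_b))                  # RMSD normalized by RANDOM_RMSD
        print(frag_a.mda_to(frag_b))                    # maximum phi/psi deviation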
1313 -class ChemShiftAssignment(Assignment):
1314 1315 BIT_SCORE_THRESHOLD = 1.1 1316
1317 - def __init__(self, source, start, end, qstart, qend, window, score, rmsd):
1318 1319 self._window = window 1320 1321 super(ChemShiftAssignment, self).__init__( 1322 source, start, end, qstart, qend, id=None, probability=1.0, 1323 rmsd=rmsd, tm_score=None, score=score, neff=None, segment=None, internal_id=None)
1324 1325 @property
1326 - def window(self):
1327 return self._window
1328
1329 -class ClusterExhaustedError(ValueError):
1330 pass
1331
1332 -class ClusterEmptyError(ClusterExhaustedError):
1333 pass
1334
1335 -class ClusterDivergingError(RuntimeError):
1336 pass
1337
1338  class FragmentCluster(object):
1339      """
1340      Provides clustering/filtering of the fragments covering a common residue
1341      in the target. Clustering is done via iterative shrinking of the cluster.
1342      At each iteration, node rejection (deletion) is attempted for each node. The
1343      node rejection causing the most significant drop in the average pairwise
1344      distance (RMSD) in the cluster is retained. This procedure is repeated
1345      until: 1) the average pairwise RMSD drops below the C{threshold} (converged),
1346      2) the cluster gets exhausted, or 3) node rejection no longer
1347      causes a drop in the average distance (not converging).
1348      
1349      @param items: cluster members
1350      @type items: iterable of L{ClusterNode}s
1351      @param threshold: RMSD threshold; continue shrinking until the mean distance
1352                        drops below this value (default=1.5)
1353      @type threshold: float
1354      @param connectedness: use only nodes which are connected to at least c% of all
1355                            initial nodes (default=0.5, meaning 50%)
1356      @type connectedness: float
1357      """
1358      
1359      MIN_LENGTH = 6
1360      
1361 - def __init__(self, items, threshold=1.5, connectedness=0.5):
1362 1363 items = set(i for i in items if i.fragment.length >= FragmentCluster.MIN_LENGTH) 1364 1365 self._matrix = {} 1366 self._threshold = float(threshold) 1367 self._connectedness = float(connectedness) 1368 self._weight = 0 1369 self._edges = 0 1370 1371 for i in items: 1372 1373 self._matrix[i] = {} 1374 #conn = 0.0 1375 1376 for j in items: 1377 distance = i.distance(j) 1378 if distance is not None: 1379 #conn += 1 1380 self._matrix[i][j] = distance 1381 self._edges += 1 1382 self._weight += distance 1383 i.weight += distance 1384 1385 #if conn / len(items) < self.connectedness: 1386 # # reject i as a first class node 1387 # del self._matrix[i] 1388 1389 self._items = set(self._matrix.keys()) 1390 1391 if len(self._items) < 1: 1392 raise ClusterEmptyError() 1393 1394 self._initcount = self.count
1395 1396 @property
1397 - def count(self):
1398 return len(self._items)
1399 1400 @property
1401 - def items(self):
1402 return tuple(self._items)
1403 1404 @property
1405 - def fragments(self):
1406 return tuple(i.fragment for i in self._items)
1407 1408 @property
1409 - def threshold(self):
1410 return self._threshold
1411 @threshold.setter
1412 - def threshold(self, value):
1413 self._threshold = float(value)
1414 1415 @property
1416 - def connectedness(self):
1417 return self._connectedness
1418
1419 - def _distances(self, skip=None):
1420 1421 d = [] 1422 1423 for i in self._matrix: 1424 if skip is i: 1425 continue 1426 1427 for j in self._matrix[i]: 1428 if skip is not j: 1429 d.append(self._matrix[i][j]) 1430 1431 return d
1432
1433 - def _distance(self, i, j):
1434 1435 if j in self._matrix[i]: 1436 return self._matrix[i][j] 1437 else: 1438 return None
1439
1440 - def mean(self, skip=None):
1441 """ 1442 @return: the current mean distance in the cluster 1443 @rtype: float 1444 """ 1445 if self._edges == 0: 1446 raise ClusterExhaustedError() 1447 1448 if not skip: 1449 return float(self._weight) / self._edges 1450 1451 else: 1452 weight = self._weight - 2 * skip.weight 1453 edges = self._edges - 2 * len(self._matrix[skip]) 1454 1455 if edges < 1: 1456 return 0 1457 else: 1458 return float(weight) / edges
1459
1460 - def centroid(self):
1461 """ 1462 @return: the current representative fragment 1463 @rtype: L{ClusterRep} 1464 1465 @note: the cluster rep is the node with the lowest average distance 1466 to all other nodes. If a fixed fragment exists, structurally similar 1467 to the rep, but longer, this fragment may be suggested as an alternative 1468 (see also L{ClusterRep}). 1469 """ 1470 1471 alt = None 1472 cen = None 1473 avg = None 1474 1475 for i in self._matrix: 1476 1477 curravg = float(i.weight) / len(self._matrix[i]) 1478 conn = len(self._matrix[i]) / float(self.count) 1479 1480 if avg is None or (curravg < avg and conn >= self.connectedness): 1481 avg = curravg 1482 cen = i 1483 elif curravg == avg: 1484 if i.fragment.length > cen.fragment.length: 1485 cen = i 1486 1487 d = self._distances() 1488 mean = numpy.mean(d) 1489 cons = sum(1.0 for i in d if i <= self.threshold) / len(d) 1490 1491 for i in self._matrix: 1492 if i is not cen and i.fixed and i.fragment.length > cen.fragment.length: 1493 distance = self._distance(i, cen) 1494 if distance is not None and distance < 0.5 * self.threshold: 1495 if alt is None or alt.fragment.length < i.fragment.length: 1496 alt = i 1497 1498 return ClusterRep(cen, mean, cons, len(self._matrix[cen]), alternative=alt, 1499 rejections=(self._initcount - self.count))
1500
1501 - def reject(self, item):
1502 """ 1503 Remove C{item} from the cluster. 1504 1505 @type item: L{ClusterNode} 1506 @raise ClusterExhaustedError: if this is the last remaining item 1507 """ 1508 if self.count == 1: 1509 raise ClusterExhaustedError() 1510 1511 assert not item.fixed 1512 1513 for i in self._matrix: 1514 if item in self._matrix[i]: 1515 distance = self._matrix[i][item] 1516 self._weight -= 2 * distance 1517 i.weight -= distance 1518 1519 del self._matrix[i][item] 1520 self._edges -= 1 1521 1522 self._edges -= len(self._matrix[item]) 1523 del self._matrix[item] 1524 self._items.remove(item)
1525
1526 - def shrinkone(self):
1527 """ 1528 Shrink the cluster by a single node. 1529 1530 @return: True on successful shrink, False otherwise (e.g. if 1531 already converged) 1532 @rtype: bool 1533 @raise ClusterExhaustedError: if exhausted 1534 @raise ClusterDivergingError: if not converging 1535 """ 1536 1537 mean = self.mean() 1538 if mean <= self.threshold or self.count == 1: 1539 return False # already shrunk enough 1540 1541 m = {} 1542 1543 for i in self._matrix: 1544 if not i.fixed: 1545 newmean = self.mean(skip=i) 1546 m[newmean] = i 1547 1548 if len(m) == 0: # only fixed items remaining 1549 raise ClusterExhaustedError() 1550 1551 newmean = min(m) 1552 1553 if newmean > mean: 1554 raise ClusterDivergingError() # can't converge, usually when fixed items are too far away from the average 1555 elif newmean < mean: 1556 junk = m[newmean] 1557 self.reject(junk) 1558 return True # successful shrink 1559 else: 1560 return False # converged
1561
1562 - def shrink(self, minitems=2):
1563 """ 1564 Start automatic shrinking. 1565 1566 @param minitems: absolute minimum of the number of nodes in the cluster 1567 @type minitems: int 1568 1569 @return: cluster's representative: the node with the lowest average 1570 distance to all other nodes in the cluster 1571 @rtype: L{ClusterRep} 1572 1573 @raise ClusterExhaustedError: if C{self.count} < C{minitems} and 1574 still not converged 1575 """ 1576 1577 if self.count > minitems: 1578 1579 while self.shrinkone(): 1580 if self.count <= minitems: 1581 raise ClusterExhaustedError() 1582 else: 1583 raise ClusterExhaustedError() 1584 1585 return self.centroid()
1586
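Sketch of direct cluster construction, mirroring what L{TargetResidue.filter} does internally (`residue` stands for a populated L{TargetResidue}):

    nodes = [ClusterNode.create(ai.fragment) for ai in residue.assignments]

    try:
        cluster = FragmentCluster(nodes, threshold=1.5)
        rep = cluster.shrink(minitems=0)          # iteratively reject outlier fragments
        if rep.has_alternative:
            rep.exchange()                        # prefer the longer, structurally similar fragment
        print(rep.centroid.id, rep.confidence, rep.rejections)
    except (ClusterExhaustedError, ClusterDivergingError):
        rep = None                                # no reliable representative at this position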
1587 -class ClusterNode(object):
1588 """ 1589 Cluster node. 1590 1591 @param fragment: fragment 1592 @type fragment: L{Assignment} 1593 @param distance: distance metric (a L{Metrics} member, default is RMSD) 1594 @type distance: str 1595 @param fixed: mark this node as fixed (cannot be rejected) 1596 @type fixed: bool 1597 """ 1598 1599 FIXED = 0.7 1600 1601 @staticmethod
1602 - def create(fragment, method=Metrics.RMSD, extend=False):
1603 """ 1604 Create a new L{ClusterNode} given a specified C{Assignment}. If this 1605 assignment is a high probability match, define it as a fixed fragment. 1606 1607 @rtype: L{ClusterNode} 1608 """ 1609 if fragment.probability > ClusterNode.FIXED and fragment.length >= FragmentCluster.MIN_LENGTH: 1610 return ClusterNode(fragment, distance=method, fixed=extend) 1611 else: 1612 return ClusterNode(fragment, distance=method, fixed=False)
1613
1614 - def __init__(self, fragment, distance=Metrics.RMSD, fixed=False):
1615 1616 if fixed and fragment.length < FragmentCluster.MIN_LENGTH: 1617 raise ValueError("Can't fix a short fragment") 1618 1619 self.fragment = fragment 1620 self.fixed = bool(fixed) 1621 self.weight = 0 1622 1623 self._distance = getattr(self.fragment, distance)
1624
1625 - def distance(self, other):
1626 """ 1627 @return: the distance between self and another node 1628 @type other: L{ClusterNode} 1629 @rtype: float 1630 """ 1631 return self._distance(other.fragment)
1632
1633  class ClusterRep(object):
1634      """
1635      Cluster's representative (centroid) node. This object carries the
1636      result of the cluster shrinking procedure.
1637      
1638      @param centroid: rep node
1639      @type centroid: L{ClusterNode}
1640      @param mean: current mean distance in the cluster
1641      @type mean: float
1642      @param consistency: percentage of pairwise distances below the RMSD C{threshold}
1643      @type consistency: float
1644      @param count: current number of nodes in the cluster
1645      @type count: int
1646      @param rejections: total number of rejections
1647      @type rejections: int
1648      @param alternative: suggested cluster rep alternative (e.g. structurally
1649                          similar to the centroid, but longer)
1650      @type alternative: L{ClusterNode}
1651      """
1652      
1653 - def __init__(self, centroid, mean, consistency, count, rejections=0, alternative=None):
1654 1655 if isinstance(centroid, ClusterNode): 1656 centroid = centroid.fragment 1657 if isinstance(alternative, ClusterNode): 1658 alternative = alternative.fragment 1659 1660 self._centroid = centroid 1661 self._alternative = alternative 1662 self._mean = mean 1663 self._consistency = consistency 1664 self._count = count 1665 self._rejections = rejections
1666 1667 @property
1668 - def confidence(self):
1669 """ 1670 Confidence of assignment: log10(count) * consistency 1671 """ 1672 if self.count <= 0 or self.count is None or self.consistency is None: 1673 return 0 1674 else: 1675 return numpy.log10(self.count) * self.consistency
1676 1677 @property
1678 - def centroid(self):
1679 return self._centroid
1680 1681 @property
1682 - def alternative(self):
1683 return self._alternative
1684 1685 @property
1686 - def has_alternative(self):
1687 return self._alternative is not None
1688 1689 @property
1690 - def mean(self):
1691 return self._mean
1692 1693 @property
1694 - def consistency(self):
1695 return self._consistency
1696 1697 @property
1698 - def count(self):
1699 return self._count
1700 1701 @property
1702 - def rejections(self):
1703 return self._rejections
1704
1705 - def exchange(self):
1706 """ 1707 If an alternative is available, swap the centroid and the alternative. 1708 """ 1709 1710 if self._alternative is not None: 1711 1712 centroid = self._centroid 1713 self._centroid = self._alternative 1714 self._alternative = centroid
1715
1716 - def to_rosetta(self, source):
1717 """ 1718 @deprecated: this method is obsolete and will be deleted soon 1719 """ 1720 return self.centroid.to_rosetta(source, weight=self.confidence)
1721
1722 -class AdaptedAssignment(object):
1723 1724 @staticmethod
1725 - def with_overhangs(center, start, end, overhang=1):
1726 1727 if center.centroid.qstart <= (start - overhang): 1728 start -= overhang 1729 elif center.centroid.qstart < start: 1730 start = center.centroid.qstart 1731 1732 if center.centroid.qend >= (end + overhang): 1733 end += overhang 1734 elif center.centroid.qend > end: 1735 end = center.centroid.end 1736 1737 return AdaptedAssignment(center, start, end)
1738
1739 - def __init__(self, center, qstart, qend):
1740 1741 if qstart < center.centroid.qstart: 1742 raise ValueError(qstart) 1743 if qend > center.centroid.qend: 1744 raise ValueError(qend) 1745 1746 self._qstart = qstart 1747 self._qend = qend 1748 self._center = center
1749 1750 @property
1751 - def fragment(self):
1752 return self._center.centroid
1753 1754 @property
1755 - def center(self):
1756 return self._center
1757 1758 @property
1759 - def confidence(self):
1760 return self._center.confidence
1761 1762 @property
1763 - def qstart(self):
1764 return self._qstart
1765 1766 @property
1767 - def qend(self):
1768 return self._qend
1769 1770 @property
1771 - def backbone(self):
1772 return self.fragment.backbone_at(self.qstart, self.qend)
1773
1774 - def chain(self, source):
1775 return self.fragment.chain_at(source, self.qstart, self.qend)
1776
1777 - def to_rosetta(self, source):
1778 return self.fragment.to_rosetta(source, self.qstart, self.qend, self.confidence)
1779
1780 -class SmoothFragmentMap(csb.core.AbstractContainer):
1781
1782 - def __init__(self, length, centroids):
1783 1784 if not length > 0: 1785 raise ValueError(length) 1786 1787 self._length = int(length) 1788 self._slots = set(range(1, self._length + 1)) 1789 self._map = {} 1790 1791 centers = list(centroids) 1792 centers.sort(key=lambda i: i.confidence, reverse=True) 1793 1794 for c in centers: 1795 self.assign(c)
1796 1797 @property
1798 - def _children(self):
1799 return self._map
1800
1801 - def assign(self, center):
1802 1803 for r in range(center.centroid.qstart, center.centroid.qend + 1): 1804 if r in self._slots: 1805 self._map[r] = center 1806 self._slots.remove(r)
1807
1808 - def patches(self):
1809 1810 center = None 1811 start = None 1812 end = None 1813 1814 for r in range(1, self._length + 1): 1815 1816 if center is None: 1817 if r in self._map: 1818 center = self._map[r] 1819 start = end = r 1820 else: 1821 center = None 1822 start = end = None 1823 else: 1824 if r in self._map: 1825 if self._map[r] is center: 1826 end = r 1827 else: 1828 yield AdaptedAssignment(center, start, end) 1829 center = self._map[r] 1830 start = end = r 1831 else: 1832 yield AdaptedAssignment(center, start, end) 1833 center = None 1834 start = end = None
1835
1836 1837 -class ResidueEventInfo(object):
1838
1839 - def __init__(self, residue, confidence=0, count=0, confident=True, gap=False, rep=None):
1840 1841 self.residue = residue 1842 self.confidence = confidence 1843 self.confident = confident 1844 self.gap = gap 1845 self.count = count 1846 self.rep = rep
1847 1848 @property
1849 - def rank(self):
1850 return self.residue.rank
1851 1852 @property
1853 - def type(self):
1854 return self.residue.type
1855 1856 @property
1857 - def torsion(self):
1858 if self.rep: 1859 return self.rep.torsion_at(self.rank, self.rank)[0] 1860 else: 1861 return None
1862
1863 1864 -class RosettaFragsetFactory(object):
1865 """ 1866 Simplifies the construction of fragment libraries. 1867 """ 1868
1869 - def __init__(self):
1870 import csb.bio.fragments.rosetta as rosetta 1871 self.rosetta = rosetta
1872
1873 - def make_fragset(self, target):
1874 """ 1875 Build a fragment library given a L{Target} and its L{Assignment}s. 1876 1877 @param target: target protein 1878 @type target: L{Target} 1879 1880 @rtype: L{RosettaFragmentMap} 1881 """ 1882 1883 frag_factory = self.rosetta.RosettaFragment 1884 fragments = list(map(frag_factory.from_object, target.matches)) 1885 #fragments = [ frag_factory.from_object(f) for f in target.matches if f.length >= 6 ] 1886 fragments.sort() 1887 1888 return self.rosetta.RosettaFragmentMap(fragments, target.length)
1889
1890 - def make_chopped(self, fragments, window):
1891 """ 1892 Build a fixed-length fragment library from a list of 1893 variable-length L{Assignment}s. 1894 1895 @param fragments: source fragments 1896 @type fragments: iterable of L{RosettaFragment}s 1897 @param window: fixed-length fragment size (for classic Rosetta: choose 9) 1898 @type window: int 1899 1900 @return: fixed-length fragment library 1901 @rtype: L{RosettaFragmentMap} 1902 """ 1903 1904 frags = [] 1905 1906 for f in fragments: 1907 for qs in range(f.qstart, f.qend - window + 1): 1908 frags.append(f.subregion(qs, qs + window - 1)) 1909 1910 return self.rosetta.RosettaFragmentMap(frags)
1911
1912 - def make_combined(self, target, filling, threshold=0.5, callback=None):
1913 """ 1914 Complement C{target}'s assignments with C{filling} (e.g. rosetta fragments). 1915 The regions to be complemented are determined by calculating the confidence 1916 at each residue (by filtering). 1917 1918 1919 @param target: target protein 1920 @type target: L{Target} 1921 @param filling: additional fragments to place in the low-conf regions 1922 @type filling: L{RosettaFragmentMap} or iterable of L{RosettaFragment} 1923 @param threshold: confidence threshold 1924 @type threshold: float 1925 1926 @return: complemented fragment library 1927 @rtype: L{RosettaFragmentMap} 1928 """ 1929 1930 fragmap = self.make_fragset(target) 1931 covered = set() 1932 1933 for r in target.residues: 1934 1935 if r.assignments.length == 0: 1936 if callback: 1937 callback(ResidueEventInfo(r.native, gap=True)) 1938 continue 1939 1940 cluster = r.filter() 1941 if cluster is None: 1942 if callback: 1943 callback(ResidueEventInfo(r.native, 0, 0, confident=False)) 1944 continue 1945 1946 if cluster.confidence >= threshold: 1947 covered.add(r.native.rank) 1948 confident = True 1949 else: 1950 confident = False 1951 1952 if callback: 1953 callback(ResidueEventInfo(r.native, cluster.confidence, cluster.count, confident)) 1954 1955 for r in target.residues: 1956 if r.native.rank not in covered: # true for gaps and low-conf residues 1957 fragmap.mark_unconfident(r.native.rank) 1958 1959 for frag in filling: 1960 fragmap.complement(frag) 1961 1962 return fragmap
1963
1964      def make_filtered(self, target, extend=False, callback=None):
1965          """
1966          Build a filtered fragment library (by clustering), containing only
1967          representative fragments (cluster centroids).
1968          
1969          @param target: target protein
1970          @type target: L{Target}
1971          @param extend: if True, pick alternative reps if available
1972          @type extend: bool
1973          
1974          @return: filtered fragment library
1975          @rtype: L{RosettaFragmentMap}
1976          """
1977          
1978          fragments = []
1979          
1980          for r in target.residues:
1981              if r.assignments.length == 0:
1982                  if callback:
1983                      callback(ResidueEventInfo(r.native, gap=True))
1984                  continue
1985              
1986              cluster = r.filter(extend=extend)
1987              if cluster is None:
1988                  if callback:
1989                      callback(ResidueEventInfo(r.native, 0, 0, confident=False))
1990                  continue    # no converged cluster at this position; skip it
1991              if extend and cluster.has_alternative:
1992                  best = cluster.alternative
1993              else:
1994                  best = cluster.centroid
1995              
1996              fragment = self.rosetta.RosettaFragment.from_object(best)
1997              fragments.append(fragment)
1998              if callback:
1999                  callback(ResidueEventInfo(r.native, cluster.confidence, cluster.count, rep=cluster.centroid))
2000          
2001          fragments.sort()
2002          return self.rosetta.RosettaFragmentMap(fragments, target.length)
2003
2004 - def mix(self, *fragsets):
2005 """ 2006 Mix fragments from multiple libraries. 2007 2008 @type fragsets: L{RosettaFragmentMap} 2009 @return: mixed fragment library 2010 @rtype: L{RosettaFragmentMap} 2011 """ 2012 2013 fragments = [] 2014 length = 0 2015 2016 for fragset in fragsets: 2017 if fragset._length > length: 2018 length = fragset._length 2019 2020 for fragment in fragset: 2021 fragments.append(fragment) 2022 2023 return self.rosetta.RosettaFragmentMap(fragments, length)
2024
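Sketch of the factory methods above (assumes `target` is a populated L{Target}; the window size of 9 matches the classic Rosetta fragment length):

    factory = RosettaFragsetFactory()

    fragset = factory.make_fragset(target)                  # variable-length library
    ninemers = factory.make_chopped(fragset, window=9)      # fixed-length 9-mers

    filtered = factory.make_filtered(target, extend=True)   # cluster centroids only
    mixed = factory.mix(ninemers, filtered)                  # merge several libraries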
2025 2026 -class BenchmarkAdapter(object):
2027
2028 - class Connection(object):
2029 2030 FACTORY = None 2031 DSN = None 2032
2033 - def __init__(self, factory=None, dsn=None):
2034 2035 self.factory = factory or self.__class__.FACTORY 2036 self.cs = dsn or self.__class__.DSN 2037 self.connection = None 2038 self.cursor = None
2039
2040 - def __enter__(self):
2041 2042 self.connection = self.factory(self.cs) 2043 try: 2044 self.cursor = self.connection.cursor() 2045 except: 2046 self.connection.close() 2047 raise 2048 return self
2049
2050 - def __exit__(self, *args):
2051 try: 2052 if not self.cursor.closed: 2053 self.cursor.close() 2054 finally: 2055 if not self.connection.closed: 2056 self.connection.close()
2057
2058 - def __init__(self, pdb_paths, connection_string=None, factory=AssignmentFactory()):
2059 2060 self._pdb = pdb_paths 2061 self._connection = None 2062 2063 from csb.bio.io.wwpdb import find, StructureParser 2064 self._parser = StructureParser 2065 self._find = find 2066 self._factory = factory 2067 2068 try: 2069 import psycopg2.extras 2070 except ImportError: 2071 raise RuntimeError('Please install the psycopg2 module first') 2072 2073 if connection_string is None: 2074 connection_string = self.connection_string() 2075 2076 BenchmarkAdapter.Connection.FACTORY = psycopg2.extras.DictConnection 2077 BenchmarkAdapter.Connection.DSN = connection_string
2078 2079 @staticmethod
2080 - def connection_string(database='FragmentBenchmarks', host='', username='', password=''):
2081 2082 fields = ['dbname={0}'.format(database)] 2083 2084 if host: 2085 fields.append('host={0}'.format(host)) 2086 if username: 2087 fields.append('user={0}'.format(username)) 2088 fields.append('password={0}'.format(password)) 2089 2090 return ' '.join(fields)
2091
2092 - def targets(self, benchmark_id):
2093 2094 with BenchmarkAdapter.Connection() as db: 2095 2096 db.cursor.callproc('reporting."GetTargets"', (benchmark_id,)) 2097 return db.cursor.fetchall()
2098
2099 - def target_details(self, target_id):
2100 2101 with BenchmarkAdapter.Connection() as db: 2102 2103 db.cursor.callproc('reporting."GetTargetDetails"', (target_id,)) 2104 return db.cursor.fetchall()
2105
2106 - def assignments(self, target_id, type):
2107 2108 with BenchmarkAdapter.Connection() as db: 2109 2110 db.cursor.callproc('reporting."GetAssignments"', (target_id, type)) 2111 return db.cursor.fetchall()
2112
2113 - def assignments_sec_structure(self, target_id, type):
2114 2115 with BenchmarkAdapter.Connection() as db: 2116 2117 db.cursor.callproc('reporting."GetTargetSecStructureAssignments2"', (target_id, type)) 2118 return db.cursor.fetchall()
2119
2120 - def scores(self, benchmark_id, type):
2121 2122 with BenchmarkAdapter.Connection() as db: 2123 2124 db.cursor.callproc('reporting."GetScores"', (benchmark_id, type)) 2125 return db.cursor.fetchall()
2126
2127 - def centroids(self, benchmark_id):
2128 2129 with BenchmarkAdapter.Connection() as db: 2130 2131 db.cursor.callproc('reporting."GetCentroids"', (benchmark_id,)) 2132 return db.cursor.fetchall()
2133
2134 - def target_segments(self, target_id):
2135 2136 with BenchmarkAdapter.Connection() as db: 2137 2138 db.cursor.callproc('reporting."GetTargetSegments"', (target_id,)) 2139 data = db.cursor.fetchall() 2140 2141 return [ TargetSegment(row['Start'], row['End'], row['Count']) for row in data ]
2142
2143 - def structure(self, accession, chain=None):
2144 2145 pdbfile = self._find(accession, self._pdb) 2146 2147 if not pdbfile and chain: 2148 pdbfile = self._find(accession + chain, self._pdb) 2149 2150 if not pdbfile: 2151 raise IOError('{0} not found here: {1}'.format(accession, self._pdb)) 2152 2153 return self._parser(pdbfile).parse_structure()
2154
2155 - def prediction(self, target_id, type, ss=False):
2156 2157 info = self.target_details(target_id) 2158 if not info: 2159 raise ValueError('No such Target ID in the database: {0}'.format(target_id)) 2160 row = info[0] 2161 2162 id = row["Accession"] 2163 length = float(row["Length"]) 2164 overlap = float(row["MaxOverlap"]) / (length or 1.) 2165 2166 native = self.structure(id[:4], id[4]).chains[id[4]] 2167 segments = self.target_segments(target_id) 2168 target = self._factory.target(id, length, native.residues, overlap, segments) 2169 2170 source = None 2171 2172 for row in self.assignments(target_id, type): 2173 2174 src_accession = row['Source'][:4] 2175 src_chain = row['Source'][4] 2176 2177 if source is None or source.accession != src_accession: 2178 try: 2179 source = self.structure(src_accession, src_chain) 2180 except (IOError, ValueError) as ex: 2181 target.errors.append(ex) 2182 continue 2183 2184 if src_chain == '_': 2185 frag_chain = source.first_chain 2186 else: 2187 frag_chain = source.chains[src_chain] 2188 if not frag_chain.has_torsion: 2189 frag_chain.compute_torsion() 2190 2191 fragment = self._factory.assignment( 2192 source=frag_chain, 2193 start=row['SourceStart'], 2194 end=row['SourceEnd'], 2195 id=row['FragmentName'], 2196 qstart=row['Start'], 2197 qend=row['End'], 2198 probability=row['Probability'], 2199 score=row['Score'], 2200 neff=row['Neff'], 2201 rmsd=row['RMSD'], 2202 tm_score=row['TMScore'], 2203 segment=row['SegmentStart'], 2204 internal_id=row['InternalID']) 2205 2206 target.assign(fragment) 2207 2208 if ss: 2209 self._attach_sec_structure(target, target_id, type) 2210 2211 return target
2212
2213      def _attach_sec_structure(self, target, target_id, type):
2214          
2215          ss = {}
2216          
2217          for row in self.assignments_sec_structure(target_id, type):
2218              frag_id, state = row["AssignmentID"], row["DSSP"]
2219              if frag_id not in ss:     # group DSSP states by assignment ID
2220                  ss[frag_id] = []
2221              
2222              ss[frag_id].append(state)
2223          
2224          for a in target.matches:
2225              if a.internal_id in ss:
2226                  dssp = ''.join(ss[a.internal_id])
2227                  a.secondary_structure = dssp
2228