Coverage for /home/deng/Projects/ete4/hackathon/ete4/ete4/phylo/spoverlap.py: 4%

112 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2024-03-21 09:19 +0100

1from .evolevents import EvolEvent 

2 

3__all__ = ["get_evol_events_from_leaf", "get_evol_events_from_root"] 

4 

5def get_evol_events_from_leaf(node, sos_thr=0.0): 

6 """ Returns a list of duplication and speciation events in 

7 which the current node has been involved. Scanned nodes are 

8 also labeled internally as dup=True|False. You can access this 

9 labels using the 'node.dup' sintaxis. 

10 

11 Method: the algorithm scans all nodes from the given leafName to 

12 the root. Nodes are assumed to be duplications when a species 

13 overlap is found between its child linages. Method is described 

14 more detail in: 

15 

16 "The Human Phylome." Huerta-Cepas J, Dopazo H, Dopazo J, Gabaldon 

17 T. Genome Biol. 2007;8(6):R109. 

18 """ 

19 # Get the tree's root 

20 root = node.root 

21 

22 try: 

23 outg1, outg2 = root.get_children() # outgroups 

24 except ValueError: 

25 raise TypeError("Tree is not rooted") 

26 

27 smaller_outg = outg1 if len(outg1) < len(outg2) else outg2 

28 

29 # Prepare to browse tree from leaf to root 

30 all_events = [] 

31 current = node 

32 ref_spcs = node.species 

33 sister_leaves = set([]) 

34 browsed_spcs = set([current.species]) 

35 browsed_leaves = set([current]) 

36 # get family Size 

37 fSize = sum(1 for n in root.leaves() if n.species == ref_spcs) 

38 

39 # Clean previous analysis 

40 for n in root.traverse(): 

41 n.del_prop("evoltype") 

42 

43 while current.up: 

44 for s in current.get_sisters(): 

45 for leaf in s.leaves(): 

46 sister_leaves.add(leaf) 

47 # Process sister node only if there is any new sequence. 

48 # (previene dupliaciones por nombres repetidos) 

49 sister_leaves = sister_leaves.difference(browsed_leaves) 

50 if len(sister_leaves)==0: 

51 current = current.up 

52 continue 

53 # Gets species at both sides of event 

54 sister_spcs = set([n.species for n in sister_leaves]) 

55 overlaped_spces = browsed_spcs & sister_spcs 

56 all_spcs = browsed_spcs | sister_spcs 

57 score = float(len(overlaped_spces))/len(all_spcs) 

58 # Creates a new evolEvent 

59 event = EvolEvent() 

60 event.fam_size = fSize 

61 event.seed = node.name 

62 # event.e_newick = current.up.get_newick() # high mem usage!! 

63 event.sos = score 

64 event.outgroup = smaller_outg.name 

65 # event.allseqs = set(current.up.get_leaf_names()) 

66 event.in_seqs = set([n.name for n in browsed_leaves]) 

67 event.out_seqs = set([n.name for n in sister_leaves]) 

68 event.inparalogs = set([n.name for n in browsed_leaves if n.species == ref_spcs]) 

69 

70 # If species overlap: duplication 

71 if score > sos_thr: 

72 event.node = current.up 

73 event.etype = "D" 

74 event.outparalogs = set([n.name for n in sister_leaves if n.species == ref_spcs]) 

75 event.orthologs = set([]) 

76 current.up.add_prop("evoltype","D") 

77 all_events.append(event) 

78 

79 # If NO species overlap: speciation 

80 elif score <= sos_thr: 

81 event.node = current.up 

82 event.etype = "S" 

83 event.orthologs = set([n.name for n in sister_leaves if n.species != ref_spcs]) 

84 event.outparalogs = set([]) 

85 current.up.add_prop("evoltype","S") 

86 all_events.append(event) 

87 

88 # Updates browsed species 

89 browsed_spcs |= sister_spcs 

90 browsed_leaves |= sister_leaves 

91 sister_leaves = set([]) 

92 # And keep ascending 

93 current = current.up 

94 return all_events 

95 

96def get_evol_events_from_root(node, sos_thr): 

97 """ Returns a list of **all** duplication and speciation 

98 events detected after this node. Nodes are assumed to be 

99 duplications when a species overlap is found between its child 

100 linages. Method is described more detail in: 

101 

102 "The Human Phylome." Huerta-Cepas J, Dopazo H, Dopazo J, Gabaldon 

103 T. Genome Biol. 2007;8(6):R109. 

104 """ 

105 

106 # Get the tree's root 

107 root = node.root 

108 

109 # Checks that is actually rooted 

110 outgroups = root.get_children() 

111 if len(outgroups) != 2: 

112 raise TypeError("Tree is not rooted") 

113 

114 # Cautch the smaller outgroup (will be stored as the tree outgroup) 

115 o1 = set([n.name for n in outgroups[0].leaves()]) 

116 o2 = set([n.name for n in outgroups[1].leaves()]) 

117 

118 

119 if len(o2)<len(o1): 

120 smaller_outg = outgroups[1] 

121 else: 

122 smaller_outg = outgroups[0] 

123 

124 # Get family size 

125 fSize = len( [n for n in root.leaves()] ) 

126 

127 # Clean data from previous analyses 

128 for n in list(root.descendants())+[root]: 

129 n.del_prop("evoltype") 

130 

131 # Gets Prepared to browse the tree from root to leaves 

132 to_visit = [] 

133 current = root 

134 all_events = [] 

135 while current: 

136 # Gets childs and appends them to the To_visit list 

137 childs = current.get_children() 

138 to_visit += childs 

139 if len(childs)>2: 

140 raise TypeError("nodes are expected to have two childs.") 

141 elif len(childs)==0: 

142 pass # leaf 

143 else: 

144 # Get leaves and species at both sides of event 

145 sideA_leaves= set([n for n in childs[0].leaves()]) 

146 sideB_leaves= set([n for n in childs[1].leaves()]) 

147 sideA_spcs = set([n.species for n in childs[0].leaves()]) 

148 sideB_spcs = set([n.species for n in childs[1].leaves()]) 

149 # Calculates species overlap 

150 overlaped_spcs = sideA_spcs & sideB_spcs 

151 all_spcs = sideA_spcs | sideB_spcs 

152 score = float(len(overlaped_spcs))/len(all_spcs) 

153 

154 # Creates a new evolEvent 

155 event = EvolEvent() 

156 event.fam_size = fSize 

157 event.branch_supports = [current.support, current.children[0].support, current.children[1].support] 

158 # event.seed = leafName 

159 # event.e_newick = current.up.get_newick() # high mem usage!! 

160 event.sos = score 

161 event.outgroup_spcs = smaller_outg.get_species() 

162 event.in_seqs = set([n.name for n in sideA_leaves]) 

163 event.out_seqs = set([n.name for n in sideB_leaves]) 

164 event.inparalogs = set([n.name for n in sideA_leaves]) 

165 # If species overlap: duplication 

166 if score >sos_thr: 

167 event.node = current 

168 event.etype = "D" 

169 event.outparalogs = set([n.name for n in sideB_leaves]) 

170 event.orthologs = set([]) 

171 current.add_prop("evoltype","D") 

172 # If NO species overlap: speciation 

173 else: 

174 event.node = current 

175 event.etype = "S" 

176 event.orthologs = set([n.name for n in sideB_leaves]) 

177 event.outparalogs = set([]) 

178 current.add_prop("evoltype","S") 

179 

180 all_events.append(event) 

181 # Keep visiting nodes 

182 try: 

183 current = to_visit.pop(0) 

184 except IndexError: 

185 current = None 

186 return all_events