Coverage for /home/deng/Projects/ete4/hackathon/ete4/ete4/phylo/spoverlap.py: 4%
112 statements
« prev ^ index » next coverage.py v7.2.7, created at 2024-03-21 09:19 +0100
« prev ^ index » next coverage.py v7.2.7, created at 2024-03-21 09:19 +0100
1from .evolevents import EvolEvent
3__all__ = ["get_evol_events_from_leaf", "get_evol_events_from_root"]
5def get_evol_events_from_leaf(node, sos_thr=0.0):
6 """ Returns a list of duplication and speciation events in
7 which the current node has been involved. Scanned nodes are
8 also labeled internally as dup=True|False. You can access this
9 labels using the 'node.dup' sintaxis.
11 Method: the algorithm scans all nodes from the given leafName to
12 the root. Nodes are assumed to be duplications when a species
13 overlap is found between its child linages. Method is described
14 more detail in:
16 "The Human Phylome." Huerta-Cepas J, Dopazo H, Dopazo J, Gabaldon
17 T. Genome Biol. 2007;8(6):R109.
18 """
19 # Get the tree's root
20 root = node.root
22 try:
23 outg1, outg2 = root.get_children() # outgroups
24 except ValueError:
25 raise TypeError("Tree is not rooted")
27 smaller_outg = outg1 if len(outg1) < len(outg2) else outg2
29 # Prepare to browse tree from leaf to root
30 all_events = []
31 current = node
32 ref_spcs = node.species
33 sister_leaves = set([])
34 browsed_spcs = set([current.species])
35 browsed_leaves = set([current])
36 # get family Size
37 fSize = sum(1 for n in root.leaves() if n.species == ref_spcs)
39 # Clean previous analysis
40 for n in root.traverse():
41 n.del_prop("evoltype")
43 while current.up:
44 for s in current.get_sisters():
45 for leaf in s.leaves():
46 sister_leaves.add(leaf)
47 # Process sister node only if there is any new sequence.
48 # (previene dupliaciones por nombres repetidos)
49 sister_leaves = sister_leaves.difference(browsed_leaves)
50 if len(sister_leaves)==0:
51 current = current.up
52 continue
53 # Gets species at both sides of event
54 sister_spcs = set([n.species for n in sister_leaves])
55 overlaped_spces = browsed_spcs & sister_spcs
56 all_spcs = browsed_spcs | sister_spcs
57 score = float(len(overlaped_spces))/len(all_spcs)
58 # Creates a new evolEvent
59 event = EvolEvent()
60 event.fam_size = fSize
61 event.seed = node.name
62 # event.e_newick = current.up.get_newick() # high mem usage!!
63 event.sos = score
64 event.outgroup = smaller_outg.name
65 # event.allseqs = set(current.up.get_leaf_names())
66 event.in_seqs = set([n.name for n in browsed_leaves])
67 event.out_seqs = set([n.name for n in sister_leaves])
68 event.inparalogs = set([n.name for n in browsed_leaves if n.species == ref_spcs])
70 # If species overlap: duplication
71 if score > sos_thr:
72 event.node = current.up
73 event.etype = "D"
74 event.outparalogs = set([n.name for n in sister_leaves if n.species == ref_spcs])
75 event.orthologs = set([])
76 current.up.add_prop("evoltype","D")
77 all_events.append(event)
79 # If NO species overlap: speciation
80 elif score <= sos_thr:
81 event.node = current.up
82 event.etype = "S"
83 event.orthologs = set([n.name for n in sister_leaves if n.species != ref_spcs])
84 event.outparalogs = set([])
85 current.up.add_prop("evoltype","S")
86 all_events.append(event)
88 # Updates browsed species
89 browsed_spcs |= sister_spcs
90 browsed_leaves |= sister_leaves
91 sister_leaves = set([])
92 # And keep ascending
93 current = current.up
94 return all_events
96def get_evol_events_from_root(node, sos_thr):
97 """ Returns a list of **all** duplication and speciation
98 events detected after this node. Nodes are assumed to be
99 duplications when a species overlap is found between its child
100 linages. Method is described more detail in:
102 "The Human Phylome." Huerta-Cepas J, Dopazo H, Dopazo J, Gabaldon
103 T. Genome Biol. 2007;8(6):R109.
104 """
106 # Get the tree's root
107 root = node.root
109 # Checks that is actually rooted
110 outgroups = root.get_children()
111 if len(outgroups) != 2:
112 raise TypeError("Tree is not rooted")
114 # Cautch the smaller outgroup (will be stored as the tree outgroup)
115 o1 = set([n.name for n in outgroups[0].leaves()])
116 o2 = set([n.name for n in outgroups[1].leaves()])
119 if len(o2)<len(o1):
120 smaller_outg = outgroups[1]
121 else:
122 smaller_outg = outgroups[0]
124 # Get family size
125 fSize = len( [n for n in root.leaves()] )
127 # Clean data from previous analyses
128 for n in list(root.descendants())+[root]:
129 n.del_prop("evoltype")
131 # Gets Prepared to browse the tree from root to leaves
132 to_visit = []
133 current = root
134 all_events = []
135 while current:
136 # Gets childs and appends them to the To_visit list
137 childs = current.get_children()
138 to_visit += childs
139 if len(childs)>2:
140 raise TypeError("nodes are expected to have two childs.")
141 elif len(childs)==0:
142 pass # leaf
143 else:
144 # Get leaves and species at both sides of event
145 sideA_leaves= set([n for n in childs[0].leaves()])
146 sideB_leaves= set([n for n in childs[1].leaves()])
147 sideA_spcs = set([n.species for n in childs[0].leaves()])
148 sideB_spcs = set([n.species for n in childs[1].leaves()])
149 # Calculates species overlap
150 overlaped_spcs = sideA_spcs & sideB_spcs
151 all_spcs = sideA_spcs | sideB_spcs
152 score = float(len(overlaped_spcs))/len(all_spcs)
154 # Creates a new evolEvent
155 event = EvolEvent()
156 event.fam_size = fSize
157 event.branch_supports = [current.support, current.children[0].support, current.children[1].support]
158 # event.seed = leafName
159 # event.e_newick = current.up.get_newick() # high mem usage!!
160 event.sos = score
161 event.outgroup_spcs = smaller_outg.get_species()
162 event.in_seqs = set([n.name for n in sideA_leaves])
163 event.out_seqs = set([n.name for n in sideB_leaves])
164 event.inparalogs = set([n.name for n in sideA_leaves])
165 # If species overlap: duplication
166 if score >sos_thr:
167 event.node = current
168 event.etype = "D"
169 event.outparalogs = set([n.name for n in sideB_leaves])
170 event.orthologs = set([])
171 current.add_prop("evoltype","D")
172 # If NO species overlap: speciation
173 else:
174 event.node = current
175 event.etype = "S"
176 event.orthologs = set([n.name for n in sideB_leaves])
177 event.outparalogs = set([])
178 current.add_prop("evoltype","S")
180 all_events.append(event)
181 # Keep visiting nodes
182 try:
183 current = to_visit.pop(0)
184 except IndexError:
185 current = None
186 return all_events