Coverage for /home/deng/Projects/ete4/hackathon/ete4/ete4/evol/parser/codemlparser.py: 7%
199 statements
« prev ^ index » next coverage.py v7.2.7, created at 2024-03-21 09:19 +0100
« prev ^ index » next coverage.py v7.2.7, created at 2024-03-21 09:19 +0100
1#!/usr/bin/python3
3"""
4Ugly parsers for outfiles of codeml, rst file for sites,
5and main outfile
6"""
8__author__ = "Francois-Jose Serra"
9__email__ = "francois@barrabin.org"
10__licence__ = "GPLv3"
11__version__ = "0.0"
13import re
14from warnings import warn
16def parse_rst(path):
17 '''
18 parse rst files from codeml, all site, branch-site models.
19 return 2 dicts "classes" of sites, and values at each site "sites"
20 '''
21 typ = None
22 classes = {}
23 sites = {}
24 n_classes = {}
25 k = 0
26 i = 0
27 bsa = False
28 path = '/'.join(path.split('/')[:-1]) + '/rst'
29 for line in open(path):
30 # get number of classes of sites
31 if line.startswith ('dN/dS '):
32 k = int(re.sub (r'.* \(K=([0-9]+)\)\n', '\\1', line))
33 continue
34 # get values of omega and proportions
35 if typ is None and \
36 re.match (r'^[a-z]+.*(\d+\.\d{5} *){'+ str(k) +'}', line):
37 var = re.sub (':', '', line.split(' ')[0])
38 if var.startswith ('p'):
39 var = 'proportions'
40 classes[var] = [float(v) for v in re.findall(r'\d+\.\d{5}', line)]
41 continue
42 # parse NEB and BEB tables
43 if '(BEB)' in line :
44 k = int(re.sub(r'.*for (\d+) classes.*\n', '\\1', line))
45 typ = 'BEB'
46 sites[typ] = {}
47 n_classes[typ] = k
48 continue
49 if '(NEB)' in line :
50 k = int(re.sub(r'.*for (\d+) classes.*\n', '\\1', line))
51 typ = 'NEB'
52 sites[typ] = {}
53 n_classes[typ] = k
54 continue
55 # at the end of some BEB/NEB tables:
56 if line.startswith ('Positively '):
57 typ = None
58 # continue if we are not in table
59 if not re.match ('^ *[0-9]+ [A-Z*-] ', line) or typ is None:
60 continue
61 # line to list
62 line = line.replace(' +- ', ' ')
63 line = re.sub ('[()]', '', line.strip()).split()
64 # get amino-acid
65 sites[typ].setdefault ('aa', []).append (line[1])
66 # get site class probability
67 probs = []
68 for i in range(k):
69 probs.append (float (line[2+i]))
70 sites [typ].setdefault('p' + str(i), []).append(float(line[2+i]))
71 sites [typ].setdefault ('pv', []).append (max(probs))
72 # get most likely site class
73 classe = int (line [3 + i])
74 sites[typ].setdefault ('class', []).append(classe)
75 # if there, get omega and error
76 try:
77 sites [typ].setdefault('w' , []).append(float (line [4 + i]))
78 except IndexError:
79 # in this case we are with branch-site A or A1 and we should sum
80 # probabilities of categories 2a and 2b
81 probs = probs[:-2] + [sum(probs[-2:])]
82 sites[typ]['pv'][-1] = max(probs)
83 bsa = True
84 try:
85 sites[typ].setdefault('w' , []).append(
86 classes['foreground w'][classe - 1])
87 except KeyError: # clade models
88 del (sites [typ]['w'])
89 try:
90 sites [typ].setdefault ('se', []).append (float (line [5 + i]))
91 except IndexError:
92 del (sites [typ]['se'])
93 return {'classes': classes,
94 'sites' :sites,
95 'n_classes': {k: n_classes[k] - bsa for k in n_classes}}
98def divide_data(pamout, model):
99 '''
100 for multiple dataset, divide outfile.
101 '''
102 for num in range (1, int (model.properties['params']['ndata'])):
103 model.name = model.name + '_' + str(num)
104 out = open (pamout + '_' + str(num), 'w')
105 copy = False
106 for line in open (pamout):
107 if copy == False and \
108 line.startswith('Data set '+ str (num) + '\n'):
109 copy = True
110 continue
111 if copy == True and \
112 line.startswith('Data set '+ str (num+1) + '\n'):
113 break
114 if copy == True:
115 out.write(line)
116 out.close()
117 if copy == False:
118 warn ('WARNING: seems that you have no multiple dataset here...'\
119 + '\n trying as with only one dataset')
120 if model.typ == 'site':
121 rst = '/'.join (pamout.split('/')[:-1])+'/rst'
122 rstout = open (rst + '_' + str(num), 'w')
123 copy = False
124 for line in open(rst):
125 if copy == False and \
126 re.match('\t' + str (num)+'\n', line) is not None:
127 copy = True
128 continue
129 if copy == True and \
130 re.match('\t' + str (num + 1)+'\n', line) is not None:
131 copy = False
132 if copy == True:
133 rstout.write(line)
134 rstout.close()
135 setattr (model, 'data_' + str (num),
136 parse_paml (pamout + '_' + str(num), model))
137 else:
138 setattr (model, 'data_' + str (num),
139 parse_paml (pamout + '_' + str(num), model))
142def get_ancestor (pamout, model):
143 '''
144 only for fb_ancestor model, retrieves ancestral sequences also
145 from rst file.
146 '''
147 for line in open ('/'.join (pamout.split('/')[:-1])+'/rst'):
148 if line.startswith ('node #'):
149 pamlid, seq = re.sub ('node#([0-9]+)([A-Z]*)\n', '\\1\t\\2',
150 re.sub (' ', '', line)).split ('\t')
151 n = model._tree.get_descendant_by_node_id (int (pamlid))
152 n.add_prop ('nt_sequence', seq)
153 elif line.startswith ('Node #'):
154 pamlid, seq = re.sub ('Node#([0-9]+)([A-Z]*)\n', '\\1\t\\2',
155 re.sub (' ', '', line)).split ('\t')
156 n = model._tree.get_descendant_by_node_id (int (pamlid))
157 n.add_prop ('sequence', seq)
158 elif line.startswith ('Counts of changes at sites'):
159 break
161def parse_paml (pamout, model):
162 '''
163 parser function for codeml files,
164 with values of w,dN,dS etc... dependending of the model
165 tested.
166 '''
167 # if multiple dataset in same file we divide the outfile and model.name+x
168 if not '*' in str (model.properties['params']['ndata']):
169 divide_data (pamout, model)
170 return
171 all_lines = open (pamout).readlines()
172 # if we do not have tree, load it
173 if model._tree is None:
174 from ..evol import EvolTree
175 model._tree = EvolTree (re.findall (r'\(.*\);', ''.join(all_lines))[2])
176 model._tree._label_as_paml()
177 # starts parsing
178 for i, line in enumerate (all_lines):
179 if line == '\n':
180 continue
181 # codon frequency
182 if line.startswith('Codon frequencies under model'):
183 model.stats ['codonFreq'] = []
184 for j in range (16):
185 line = list(map (float, re.findall (r'\d\.\d+', all_lines [i+j+1])))
186 model.stats ['codonFreq'] += [line]
187 continue
188 if line.startswith('Nei & Gojobori 1986'):
189 model.stats ['codonFreq'] = []
190 if 'codonFreq' not in model.stats:
191 continue
192 ######################
193 # start serious staff
194 line = line.rstrip()
195 # lnL and number of parameters
196 if line.startswith ('lnL'):
197 try:
198 line = re.sub (r'.* np: *(\d+)\): +(-\d+\.\d+).*',
199 '\\1 \\2', line)
200 model.stats ['np' ] = int (line.split()[0])
201 model.stats ['lnL'] = float (line.split()[1])
202 except ValueError:
203 line = re.sub (r'.* np: *(\d+)\): +(nan).*',
204 '\\1 \\2', line)
205 model.stats ['np' ] = int (line.split()[0])
206 model.stats ['lnL'] = float ('-inf')
207 continue
208 # get labels of internal branches
209 if line.count('..') >= 2:
210 labels = re.findall (r'\d+\.\.\d+', line + ' ')
211 _check_paml_labels (model._tree, labels, pamout, model)
212 continue
213 # retrieve kappa
214 if line.startswith ('kappa '):
215 try:
216 model.stats ['kappa'] = float (re.sub (r'.*(\d+\.\d+).*',
217 '\\1', line))
218 except ValueError:
219 model.stats ['kappa'] = 'nan'
220 # retrieve dS dN t w N S and if present, errors. from summary table
221 if line.count('..') == 1 and line.startswith (' '):
222 if not re.match (r' +\d+\.\.\d+ +\d+\.\d+ ', line):
223 if re.match (r' +( +\d+\.\d+){8}', all_lines [i+1]):
224 _get_values (model, line.split ()[0]+' '+all_lines [i+1])
225 continue
226 _get_values (model, line)
227 continue
229def _get_values(model, line):
230 '''
231 just to ligther main parser
232 '''
233 vals = line.split()
234 # store values under paml_id
235 paml_id = int (vals[0].split('..')[1])
236 model.branches[paml_id].update({
237 'bL' : float (vals [1]),
238 'N' : float (vals [2]),
239 'S' : float (vals [3]),
240 'w' : float (vals [4]),
241 'dN' : float (vals [5]),
242 'dS' : float (vals [6]),
243 'SEdN': float (vals[vals.index ('dN') + 4]) if 'dN' in line else None,
244 'SEdS': float (vals[vals.index ('dS') + 4]) if 'dS' in line else None
245 })
247def _check_paml_labels (tree, paml_labels, pamout, model):
248 '''
249 * check paml labels
250 Should not be necessary if all codeml is run through ETE.
251 '''
252 try:
253 relations = sorted ([list(map( int, x.split('..'))) for x in paml_labels],
254 key=lambda x: x[1])
255 except IndexError:
256 return
257 # check that labelling corresponds with paml...
258 for rel in relations:
259 try:
260 node = tree.get_descendant_by_node_id(rel[1])
261 if int (node.up.props.get('node_id')) != int (rel[0]):
262 warn('WARNING: labelling does not correspond (bad tree?)!!\n' + \
263 ' Getting them from ' + pamout)
264 _get_labels_from_paml(tree, relations, pamout, model)
265 break
266 except IndexError:
267 # if unable to find one node, relabel the whole tree
268 print(rel)
269 warn ('ERROR: labelling does not correspond!!\n' + \
270 ' Getting them from ' + pamout)
271 _get_labels_from_paml(tree, relations, pamout, model)
273def _get_labels_from_paml (tree, relations, pamout, model):
274 '''
275 in case problem in labelling... and of course it is not my fault...
276 retrieve node_ids from outfile... from relations line.
277 This may occur when loading a model that was run outside ETE.
278 '''
279 from copy import copy
280 old2new = {}
281 # label leaves
282 for line in open (pamout, 'r').readlines():
283 if re.search ('^#[0-9][0-9]*:', line):
284 nam, paml_id = re.sub ('#([0-9]+): (.*)', '\\2 \\1',
285 line.strip()).split()
286 node = (tree & nam)
287 old2new[node.props.get('node_id')] = int(paml_id)
288 node.add_prop('node_id', int(paml_id))
289 if line.startswith('Sums of codon'):
290 break
291 # label the root
292 tree.add_prop ('node_id', int (len (tree) + 1))
293 # label other internal nodes
294 for node in tree.traverse(strategy='postorder'):
295 if node.is_root: continue
296 paml_id = next(filter(lambda x: x[1]==node.props.get('node_id'), relations))[0]
297 old2new[node.up.props.get('node_id')] = paml_id
298 node.up.add_prop('node_id', paml_id)
299 ### change keys in branches dict of model
300 branches = copy(model.branches)
301 for b in model.branches:
302 model.branches[b] = branches[old2new[b]]