Coverage for /home/deng/Projects/ete4/hackathon/ete4/ete4/evol/parser/codemlparser.py: 7%

199 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2024-03-21 09:19 +0100

1#!/usr/bin/python3 

2 

3""" 

4Ugly parsers for outfiles of codeml, rst file for sites, 

5and main outfile 

6""" 

7 

8__author__ = "Francois-Jose Serra" 

9__email__ = "francois@barrabin.org" 

10__licence__ = "GPLv3" 

11__version__ = "0.0" 

12 

13import re 

14from warnings import warn 

15 

16def parse_rst(path): 

17 ''' 

18 parse rst files from codeml, all site, branch-site models. 

19 return 2 dicts "classes" of sites, and values at each site "sites" 

20 ''' 

21 typ = None 

22 classes = {} 

23 sites = {} 

24 n_classes = {} 

25 k = 0 

26 i = 0 

27 bsa = False 

28 path = '/'.join(path.split('/')[:-1]) + '/rst' 

29 for line in open(path): 

30 # get number of classes of sites 

31 if line.startswith ('dN/dS '): 

32 k = int(re.sub (r'.* \(K=([0-9]+)\)\n', '\\1', line)) 

33 continue 

34 # get values of omega and proportions 

35 if typ is None and \ 

36 re.match (r'^[a-z]+.*(\d+\.\d{5} *){'+ str(k) +'}', line): 

37 var = re.sub (':', '', line.split(' ')[0]) 

38 if var.startswith ('p'): 

39 var = 'proportions' 

40 classes[var] = [float(v) for v in re.findall(r'\d+\.\d{5}', line)] 

41 continue 

42 # parse NEB and BEB tables 

43 if '(BEB)' in line : 

44 k = int(re.sub(r'.*for (\d+) classes.*\n', '\\1', line)) 

45 typ = 'BEB' 

46 sites[typ] = {} 

47 n_classes[typ] = k 

48 continue 

49 if '(NEB)' in line : 

50 k = int(re.sub(r'.*for (\d+) classes.*\n', '\\1', line)) 

51 typ = 'NEB' 

52 sites[typ] = {} 

53 n_classes[typ] = k 

54 continue 

55 # at the end of some BEB/NEB tables: 

56 if line.startswith ('Positively '): 

57 typ = None 

58 # continue if we are not in table 

59 if not re.match ('^ *[0-9]+ [A-Z*-] ', line) or typ is None: 

60 continue 

61 # line to list 

62 line = line.replace(' +- ', ' ') 

63 line = re.sub ('[()]', '', line.strip()).split() 

64 # get amino-acid 

65 sites[typ].setdefault ('aa', []).append (line[1]) 

66 # get site class probability 

67 probs = [] 

68 for i in range(k): 

69 probs.append (float (line[2+i])) 

70 sites [typ].setdefault('p' + str(i), []).append(float(line[2+i])) 

71 sites [typ].setdefault ('pv', []).append (max(probs)) 

72 # get most likely site class 

73 classe = int (line [3 + i]) 

74 sites[typ].setdefault ('class', []).append(classe) 

75 # if there, get omega and error 

76 try: 

77 sites [typ].setdefault('w' , []).append(float (line [4 + i])) 

78 except IndexError: 

79 # in this case we are with branch-site A or A1 and we should sum 

80 # probabilities of categories 2a and 2b 

81 probs = probs[:-2] + [sum(probs[-2:])] 

82 sites[typ]['pv'][-1] = max(probs) 

83 bsa = True 

84 try: 

85 sites[typ].setdefault('w' , []).append( 

86 classes['foreground w'][classe - 1]) 

87 except KeyError: # clade models 

88 del (sites [typ]['w']) 

89 try: 

90 sites [typ].setdefault ('se', []).append (float (line [5 + i])) 

91 except IndexError: 

92 del (sites [typ]['se']) 

93 return {'classes': classes, 

94 'sites' :sites, 

95 'n_classes': {k: n_classes[k] - bsa for k in n_classes}} 

96 

97 

98def divide_data(pamout, model): 

99 ''' 

100 for multiple dataset, divide outfile. 

101 ''' 

102 for num in range (1, int (model.properties['params']['ndata'])): 

103 model.name = model.name + '_' + str(num) 

104 out = open (pamout + '_' + str(num), 'w') 

105 copy = False 

106 for line in open (pamout): 

107 if copy == False and \ 

108 line.startswith('Data set '+ str (num) + '\n'): 

109 copy = True 

110 continue 

111 if copy == True and \ 

112 line.startswith('Data set '+ str (num+1) + '\n'): 

113 break 

114 if copy == True: 

115 out.write(line) 

116 out.close() 

117 if copy == False: 

118 warn ('WARNING: seems that you have no multiple dataset here...'\ 

119 + '\n trying as with only one dataset') 

120 if model.typ == 'site': 

121 rst = '/'.join (pamout.split('/')[:-1])+'/rst' 

122 rstout = open (rst + '_' + str(num), 'w') 

123 copy = False 

124 for line in open(rst): 

125 if copy == False and \ 

126 re.match('\t' + str (num)+'\n', line) is not None: 

127 copy = True 

128 continue 

129 if copy == True and \ 

130 re.match('\t' + str (num + 1)+'\n', line) is not None: 

131 copy = False 

132 if copy == True: 

133 rstout.write(line) 

134 rstout.close() 

135 setattr (model, 'data_' + str (num), 

136 parse_paml (pamout + '_' + str(num), model)) 

137 else: 

138 setattr (model, 'data_' + str (num), 

139 parse_paml (pamout + '_' + str(num), model)) 

140 

141 

142def get_ancestor (pamout, model): 

143 ''' 

144 only for fb_ancestor model, retrieves ancestral sequences also 

145 from rst file. 

146 ''' 

147 for line in open ('/'.join (pamout.split('/')[:-1])+'/rst'): 

148 if line.startswith ('node #'): 

149 pamlid, seq = re.sub ('node#([0-9]+)([A-Z]*)\n', '\\1\t\\2', 

150 re.sub (' ', '', line)).split ('\t') 

151 n = model._tree.get_descendant_by_node_id (int (pamlid)) 

152 n.add_prop ('nt_sequence', seq) 

153 elif line.startswith ('Node #'): 

154 pamlid, seq = re.sub ('Node#([0-9]+)([A-Z]*)\n', '\\1\t\\2', 

155 re.sub (' ', '', line)).split ('\t') 

156 n = model._tree.get_descendant_by_node_id (int (pamlid)) 

157 n.add_prop ('sequence', seq) 

158 elif line.startswith ('Counts of changes at sites'): 

159 break 

160 

161def parse_paml (pamout, model): 

162 ''' 

163 parser function for codeml files, 

164 with values of w,dN,dS etc... dependending of the model 

165 tested. 

166 ''' 

167 # if multiple dataset in same file we divide the outfile and model.name+x 

168 if not '*' in str (model.properties['params']['ndata']): 

169 divide_data (pamout, model) 

170 return 

171 all_lines = open (pamout).readlines() 

172 # if we do not have tree, load it 

173 if model._tree is None: 

174 from ..evol import EvolTree 

175 model._tree = EvolTree (re.findall (r'\(.*\);', ''.join(all_lines))[2]) 

176 model._tree._label_as_paml() 

177 # starts parsing 

178 for i, line in enumerate (all_lines): 

179 if line == '\n': 

180 continue 

181 # codon frequency 

182 if line.startswith('Codon frequencies under model'): 

183 model.stats ['codonFreq'] = [] 

184 for j in range (16): 

185 line = list(map (float, re.findall (r'\d\.\d+', all_lines [i+j+1]))) 

186 model.stats ['codonFreq'] += [line] 

187 continue 

188 if line.startswith('Nei & Gojobori 1986'): 

189 model.stats ['codonFreq'] = [] 

190 if 'codonFreq' not in model.stats: 

191 continue 

192 ###################### 

193 # start serious staff 

194 line = line.rstrip() 

195 # lnL and number of parameters 

196 if line.startswith ('lnL'): 

197 try: 

198 line = re.sub (r'.* np: *(\d+)\): +(-\d+\.\d+).*', 

199 '\\1 \\2', line) 

200 model.stats ['np' ] = int (line.split()[0]) 

201 model.stats ['lnL'] = float (line.split()[1]) 

202 except ValueError: 

203 line = re.sub (r'.* np: *(\d+)\): +(nan).*', 

204 '\\1 \\2', line) 

205 model.stats ['np' ] = int (line.split()[0]) 

206 model.stats ['lnL'] = float ('-inf') 

207 continue 

208 # get labels of internal branches 

209 if line.count('..') >= 2: 

210 labels = re.findall (r'\d+\.\.\d+', line + ' ') 

211 _check_paml_labels (model._tree, labels, pamout, model) 

212 continue 

213 # retrieve kappa 

214 if line.startswith ('kappa '): 

215 try: 

216 model.stats ['kappa'] = float (re.sub (r'.*(\d+\.\d+).*', 

217 '\\1', line)) 

218 except ValueError: 

219 model.stats ['kappa'] = 'nan' 

220 # retrieve dS dN t w N S and if present, errors. from summary table 

221 if line.count('..') == 1 and line.startswith (' '): 

222 if not re.match (r' +\d+\.\.\d+ +\d+\.\d+ ', line): 

223 if re.match (r' +( +\d+\.\d+){8}', all_lines [i+1]): 

224 _get_values (model, line.split ()[0]+' '+all_lines [i+1]) 

225 continue 

226 _get_values (model, line) 

227 continue 

228 

229def _get_values(model, line): 

230 ''' 

231 just to ligther main parser 

232 ''' 

233 vals = line.split() 

234 # store values under paml_id 

235 paml_id = int (vals[0].split('..')[1]) 

236 model.branches[paml_id].update({ 

237 'bL' : float (vals [1]), 

238 'N' : float (vals [2]), 

239 'S' : float (vals [3]), 

240 'w' : float (vals [4]), 

241 'dN' : float (vals [5]), 

242 'dS' : float (vals [6]), 

243 'SEdN': float (vals[vals.index ('dN') + 4]) if 'dN' in line else None, 

244 'SEdS': float (vals[vals.index ('dS') + 4]) if 'dS' in line else None 

245 }) 

246 

247def _check_paml_labels (tree, paml_labels, pamout, model): 

248 ''' 

249 * check paml labels 

250 Should not be necessary if all codeml is run through ETE. 

251 ''' 

252 try: 

253 relations = sorted ([list(map( int, x.split('..'))) for x in paml_labels], 

254 key=lambda x: x[1]) 

255 except IndexError: 

256 return 

257 # check that labelling corresponds with paml... 

258 for rel in relations: 

259 try: 

260 node = tree.get_descendant_by_node_id(rel[1]) 

261 if int (node.up.props.get('node_id')) != int (rel[0]): 

262 warn('WARNING: labelling does not correspond (bad tree?)!!\n' + \ 

263 ' Getting them from ' + pamout) 

264 _get_labels_from_paml(tree, relations, pamout, model) 

265 break 

266 except IndexError: 

267 # if unable to find one node, relabel the whole tree 

268 print(rel) 

269 warn ('ERROR: labelling does not correspond!!\n' + \ 

270 ' Getting them from ' + pamout) 

271 _get_labels_from_paml(tree, relations, pamout, model) 

272 

273def _get_labels_from_paml (tree, relations, pamout, model): 

274 ''' 

275 in case problem in labelling... and of course it is not my fault... 

276 retrieve node_ids from outfile... from relations line. 

277 This may occur when loading a model that was run outside ETE. 

278 ''' 

279 from copy import copy 

280 old2new = {} 

281 # label leaves 

282 for line in open (pamout, 'r').readlines(): 

283 if re.search ('^#[0-9][0-9]*:', line): 

284 nam, paml_id = re.sub ('#([0-9]+): (.*)', '\\2 \\1', 

285 line.strip()).split() 

286 node = (tree & nam) 

287 old2new[node.props.get('node_id')] = int(paml_id) 

288 node.add_prop('node_id', int(paml_id)) 

289 if line.startswith('Sums of codon'): 

290 break 

291 # label the root 

292 tree.add_prop ('node_id', int (len (tree) + 1)) 

293 # label other internal nodes 

294 for node in tree.traverse(strategy='postorder'): 

295 if node.is_root: continue 

296 paml_id = next(filter(lambda x: x[1]==node.props.get('node_id'), relations))[0] 

297 old2new[node.up.props.get('node_id')] = paml_id 

298 node.up.add_prop('node_id', paml_id) 

299 ### change keys in branches dict of model 

300 branches = copy(model.branches) 

301 for b in model.branches: 

302 model.branches[b] = branches[old2new[b]]