Coverage for /home/deng/Projects/metatree_drawer/treeprofiler_algo/pastml/pastml/acr.py: 9%
492 statements
« prev ^ index » next coverage.py v7.2.7, created at 2024-03-21 09:19 +0100
« prev ^ index » next coverage.py v7.2.7, created at 2024-03-21 09:19 +0100
1import logging
2import os
3import warnings
4from collections import defaultdict, Counter
5from multiprocessing.pool import ThreadPool
7import numpy as np
8import pandas as pd
9from Bio.Phylo import NewickIO, write
10from Bio.Phylo.NewickIO import StringIO
11from ete4 import Tree
13from pastml import col_name2cat, value2list, STATES, METHOD, CHARACTER, get_personalized_feature_name, numeric2datetime, \
14 datetime2numeric
15from pastml.annotation import preannotate_forest, ForestStats
16from pastml.file import get_combined_ancestral_state_file, get_named_tree_file, get_pastml_parameter_file, \
17 get_pastml_marginal_prob_file, get_pastml_work_dir
18from pastml.ml import MARGINAL_PROBABILITIES, is_ml, is_marginal, MPPA, ml_acr, \
19 ML_METHODS, MAP, JOINT, ALL, ML, META_ML_METHODS, MARGINAL_ML_METHODS, get_default_ml_method
20from pastml.models import MODEL, SCALING_FACTOR, SMOOTHING_FACTOR
21from pastml.models.CustomRatesModel import CustomRatesModel, CUSTOM_RATES
22from pastml.models.EFTModel import EFTModel, EFT
23from pastml.models.F81Model import F81Model, F81
24from pastml.models.HKYModel import HKYModel, HKY, HKY_STATES
25from pastml.models.JCModel import JCModel, JC
26from pastml.models.JTTModel import JTTModel, JTT, JTT_STATES
27from pastml.parsimony import is_parsimonious, parsimonious_acr, ACCTRAN, DELTRAN, DOWNPASS, MP_METHODS, MP, \
28 get_default_mp_method
29from pastml.tree import name_tree, annotate_dates, DATE, read_forest, DATE_CI, resolve_trees, IS_POLYTOMY, \
30 unresolve_trees, clear_extra_features
31from pastml.visualisation import get_formatted_date
32from pastml.visualisation.cytoscape_manager import visualize, TIMELINE_SAMPLED, TIMELINE_NODES, TIMELINE_LTT, \
33 DIST_TO_ROOT_LABEL, DATE_LABEL
34from pastml.visualisation.itol_manager import generate_itol_annotations
35from pastml.visualisation.tree_compressor import REASONABLE_NUMBER_OF_TIPS, VERTICAL, HORIZONTAL, TRIM
# Version stamp written into every serialized parameter file (see _serialize_acr).
PASTML_VERSION = '1.9.42'

# Maps a model name constant to the class implementing that substitution model.
model2class = {F81: F81Model, JC: JCModel, CUSTOM_RATES: CustomRatesModel, HKY: HKYModel, JTT: JTTModel, EFT: EFTModel}

# Silence warnings; append=True keeps any filters configured by the caller.
warnings.filterwarnings("ignore", append=True)

# Pseudo prediction method: keep the pre-annotated character states as-is (no inference).
COPY = 'COPY'
def _serialize_acr(args):
    """
    Writes one ACR result to disk: a tab-separated parameter/statistics file,
    plus (for marginal ML methods) a marginal-probabilities table.

    :param args: pair (acr_result dict, path to the working directory)
    :return: void
    """
    acr_result, work_dir = args
    method = acr_result[METHOD]
    character = acr_result[CHARACTER]
    model_name = acr_result[MODEL].name if MODEL in acr_result else None
    out_param_file = os.path.join(
        work_dir,
        get_pastml_parameter_file(method=method, model=model_name, column=character))

    # These entries are either serialized separately or not serializable as plain values.
    skipped_keys = {STATES, MARGINAL_PROBABILITIES, METHOD, MODEL}

    # Not using DataFrames to speed up document writing
    with open(out_param_file, 'w+') as f:
        f.write('parameter\tvalue\n')
        f.write('pastml_version\t{}\n'.format(PASTML_VERSION))
        for name in sorted(acr_result.keys()):
            if name not in skipped_keys:
                f.write('{}\t{}\n'.format(name, acr_result[name]))
        f.write('{}\t{}\n'.format(METHOD, method))
        if is_ml(method):
            acr_result[MODEL].save_parameters(f)
    logging.getLogger('pastml').debug('Serialized ACR parameters and statistics for {} to {}.'
                                      .format(character, out_param_file))

    if is_marginal(method):
        out_mp_file = os.path.join(
            work_dir,
            get_pastml_marginal_prob_file(method=method, model=acr_result[MODEL].name,
                                          column=character))
        acr_result[MARGINAL_PROBABILITIES].to_csv(out_mp_file, sep='\t', index_label='node')
        logging.getLogger('pastml').debug('Serialized marginal probabilities for {} to {}.'
                                          .format(character, out_mp_file))
def acr(forest, df=None, columns=None, column2states=None, prediction_method=MPPA, model=F81,
        column2parameters=None, column2rates=None,
        force_joint=True, threads=0,
        reoptimise=False, tau=0, resolve_polytomies=False, frequency_smoothing=False):
    """
    Reconstructs ancestral states for the given tree and
    all the characters specified as columns of the given annotation dataframe.

    :param df: dataframe indexed with tree node names
        and containing characters for which ACR should be performed as columns.
    :type df: pandas.DataFrame
    :param forest: tree or list of trees whose ancestral states are to be reconstructed.
    :type forest: ete3.Tree or list(ete3.Tree)
    :param model: (optional, default is F81) model(s) to be used by PASTML,
        can be either one model to be used for all the characters,
        or a list of different models (in the same order as the annotation dataframe columns)
    :type model: str or list(str)
    :param prediction_method: (optional, default is MPPA) ancestral state prediction method(s) to be used by PASTML,
        can be either one method to be used for all the characters,
        or a list of different methods (in the same order as the annotation dataframe columns)
    :type prediction_method: str or list(str)
    :param column2parameters: an optional way to fix some parameters,
        must be in a form {column: {param: value}},
        where param can be a character state (then the value should specify its frequency between 0 and 1),
        or pastml.ml.SCALING_FACTOR (then the value should be the scaling factor for three branches,
        e.g. set to 1 to keep the original branches). Could also be in a form {column: path_to_param_file}.
    :type column2parameters: dict
    :param reoptimise: (False by default) if set to True and the parameters are specified,
        they will be considered as an optimisation starting point instead, and the parameters will be optimised.
    :type reoptimise: bool
    :param force_joint: (optional, default is True) whether the JOINT state should be added to the MPPA prediction
        even when not selected by the Brier score
    :type force_joint: bool
    :param tau: a smoothing factor to apply to branch lengths during likelihood calculation.
        If set to zero (default), zero internal branches will be collapsed instead.
        If set to None, the smoothing factor will be optimised.
    :type tau: float
    :param threads: (optional, default is 0, which stands for automatic) number of threads PastML can use for parallezation.
        By default, detected automatically based on the system. Note that PastML will at most use as many threads
        as the number of characters (-c option) being analysed plus one.
    :type threads: int

    :return: list of ACR result dictionaries, one per character.
    :rtype: list(dict)
    """
    logger = logging.getLogger('pastml')
    if isinstance(forest, Tree):
        forest = [forest]

    if columns is None:
        if df is None:
            raise ValueError('Either the tree should be preannotated with character values '
                             'and columns and column2states specified, '
                             'or an annotation dataframe provided!')
        columns = df.columns
        column2states = {column: np.array(sorted([_ for _ in df[column].unique() if not pd.isna(_) and '' != _]))
                         for column in columns}
        preannotate_forest(forest, df=df)

    forest_stats = ForestStats(forest)
    logger.debug('\n=============ACR===============================')

    column2parameters = column2parameters if column2parameters else {}
    column2rates = column2rates if column2rates else {}

    prediction_methods = value2list(len(columns), prediction_method, MPPA)
    models = value2list(len(columns), model, F81)

    # BUGFIX: decide once, before the per-character loop, whether tau is to be optimised.
    # Previously this was computed inside the loop after tau had been reset from None to 0
    # for the first ML character, so every subsequent character silently got
    # optimise_tau=False (unless reoptimise was set).
    optimise_tau = tau is None or reoptimise
    if tau is None:
        tau = 0

    def get_states(method, model, column):
        # Returns the state space for the given character, restricted to the
        # model-imposed alphabet for HKY/JTT (and intersects node annotations with it).
        initial_states = column2states[column]
        if not is_ml(method) or model not in {HKY, JTT}:
            return initial_states
        states = HKY_STATES if HKY == model else JTT_STATES
        if not set(initial_states) & set(states):
            raise ValueError('The allowed states for model {} are {}, '
                             'but your annotation file specifies {} as states in column {}.'
                             .format(model, ', '.join(states), ', '.join(initial_states), column))
        state_set = set(states)
        for root in forest:
            for n in root.traverse():
                if column in n.props.keys():
                    n.add_prop(column, state_set & n.props.get(column))
        return states

    # If we gonna resolve polytomies we might need to get back to the initial states so let's memorise them
    n2c2states = defaultdict(dict)
    for root in forest:
        for n in root.traverse():
            for c in columns:
                vs = n.props.get(c, set())
                if vs:
                    n2c2states[n][c] = vs

    # character -> [method, states] for COPY/MP methods,
    # character -> [method, model_instance, observed_frequencies] for ML methods
    character2settings = {}
    for (character, prediction_method, model) in zip(columns, prediction_methods, models):
        logger.debug('ACR settings for {}:\n\tMethod:\t{}{}.'
                     .format(character, prediction_method,
                             '\n\tModel:\t{}'.format(model) if model and is_ml(prediction_method) else ''))
        if COPY == prediction_method or is_parsimonious(prediction_method):
            states = get_states(prediction_method, model, character)
            character2settings[character] = [prediction_method, states]
        elif is_ml(prediction_method):
            params = column2parameters.get(character)
            rate_file = column2rates.get(character)
            states = get_states(prediction_method, model, character)

            missing_data, observed_frequencies, state2index = calculate_observed_freqs(character, forest, states)

            logger.debug('Observed frequencies for {}:{}{}.'
                         .format(character,
                                 ''.join('\n\tfrequency of {}:\t{:.6f}'
                                         .format(state, observed_frequencies[state2index[state]])
                                         for state in states),
                                 '\n\tfraction of missing data:\t{:.6f}'
                                 .format(missing_data) if missing_data else ''))

            model_instance = model2class[model](parameter_file=params, rate_matrix_file=rate_file,
                                                reoptimise=reoptimise,
                                                frequency_smoothing=frequency_smoothing, tau=tau,
                                                optimise_tau=optimise_tau, states=states, forest_stats=forest_stats,
                                                observed_frequencies=observed_frequencies)
            # BUGFIX: keep this character's observed frequencies with its settings.
            # _work used to close over the loop variable observed_frequencies and hence
            # always passed the frequencies of the LAST processed character to ml_acr.
            character2settings[character] = [prediction_method, model_instance, observed_frequencies]
        else:
            raise ValueError('Method {} is unknown, should be one of ML ({}), one of MP ({}) or {}'
                             .format(prediction_method, ', '.join(ML_METHODS), ', '.join(MP_METHODS), COPY))
    if threads < 1:
        threads = max(os.cpu_count(), 1)

    def _work(character):
        # Runs ACR for a single character with its pre-configured method/model/states.
        settings = character2settings[character]
        prediction_method = settings[0]
        if COPY == prediction_method:
            return {CHARACTER: character, STATES: settings[1], METHOD: prediction_method}
        if is_ml(prediction_method):
            return ml_acr(forest=forest, character=character, prediction_method=prediction_method,
                          model=settings[1],
                          force_joint=force_joint, observed_frequencies=settings[2])
        if is_parsimonious(prediction_method):
            return parsimonious_acr(forest=forest, character=character, prediction_method=prediction_method,
                                    states=settings[1],
                                    num_nodes=forest_stats.num_nodes, num_tips=forest_stats.num_tips)

    def _run_all():
        # Runs _work for every character, in parallel when several threads are allowed.
        if threads > 1:
            with ThreadPool(processes=threads - 1) as pool:
                return pool.map(func=_work, iterable=character2settings.keys())
        return [_work(character) for character in character2settings.keys()]

    acr_results = flatten_lists(_run_all())

    column2states = {acr_result[CHARACTER]: acr_result[STATES] for acr_result in acr_results}
    column2copy = {acr_result[CHARACTER]: acr_result[METHOD] == COPY for acr_result in acr_results}
    if resolve_polytomies and resolve_trees(column2states, forest):
        level = logger.level
        logger.setLevel(logging.ERROR)
        # we have selected states before, so now need to reset them
        for root in forest:
            for n in root.traverse():
                c2states = n2c2states[n]
                for c in columns:
                    if c in c2states:
                        n.add_prop(c, c2states[c])
                    # if it is a copy method we just need to keep the polytomy state
                    # as there is no way to calculate a state
                    elif not n.props.get(IS_POLYTOMY, False) or not column2copy[c]:
                        n.del_prop(c)

        forest_stats = ForestStats(forest)
        for acr_res in acr_results:
            character = acr_res[CHARACTER]
            method = acr_res[METHOD]
            if is_ml(method):
                if character not in character2settings:
                    # A meta-method suffixed the character name with the method; strip the suffix.
                    # BUGFIX: format the suffix before searching for it
                    # (was character.rfind('_{}').format(method), which called .format on an int
                    # and raised AttributeError).
                    character = character[:character.rfind('_{}'.format(method))]
                character2settings[character][1].freeze()
                character2settings[character][1].forest_stats = forest_stats
        acr_results = _run_all()
        logger.setLevel(level)
        while unresolve_trees(column2states, forest):
            logger.setLevel(logging.ERROR)
            acr_results = _run_all()
            logger.setLevel(level)
        logger.setLevel(level)
        acr_results = flatten_lists(acr_results)
    return acr_results
283# def calculate_observed_freqs(character, forest, states):
284# n = len(states)
285# missing_data = 0.
286# state2index = dict(zip(states, range(n)))
287# observed_frequencies = np.zeros(n, np.float64)
288# for tree in forest:
289# for _ in tree:
290# state = _.props.get(character, set())
291# if state:
292# num_node_states = len(state)
293# for _ in state:
294# observed_frequencies[state2index[_]] += 1. / num_node_states
295# else:
296# missing_data += 1
297# total_count = observed_frequencies.sum() + missing_data
298# observed_frequencies /= observed_frequencies.sum()
299# missing_data /= total_count
300# return missing_data, observed_frequencies, state2index
def calculate_observed_freqs(character, forest, states):
    """
    Calculates the observed state frequencies of a character over the tips of a forest.

    Each tip contributes a total weight of 1, split equally among its annotated states;
    tips without an annotation are counted as missing data.

    :param character: name of the character (node property) whose states are counted.
    :param forest: iterable of trees; iterating a tree yields its tips,
        each exposing a ``props`` dict.
    :param states: ordered collection of all possible character states.
    :return: tuple (fraction of tips with missing data,
        numpy array of observed frequencies (normalised to sum to 1 when any tip is annotated),
        dict mapping each state to its index in the frequency array).
    """
    n = len(states)
    missing_data = 0.
    state2index = dict(zip(states, range(n)))
    observed_frequencies = np.zeros(n, np.float64)
    for tree in forest:
        for node in tree:
            state = node.props.get(character, set())
            # A single state may be stored as a plain string: normalise it to a set.
            if isinstance(state, str):
                state = {state}

            if state:
                # Split this tip's unit weight equally among its (possibly ambiguous) states.
                num_node_states = len(state)
                for st in state:
                    observed_frequencies[state2index[st]] += 1. / num_node_states
            else:
                missing_data += 1

    observed_sum = observed_frequencies.sum()
    total_count = observed_sum + missing_data
    # BUGFIX/robustness: avoid 0/0 (NaN results) when the forest is empty
    # or no tip carries an annotation for this character.
    if observed_sum:
        observed_frequencies /= observed_sum
    if total_count:
        missing_data /= total_count
    return missing_data, observed_frequencies, state2index
def flatten_lists(lists):
    """
    Flattens one level of nesting: items that are lists are expanded in place,
    all other items are kept as-is.

    :param lists: iterable possibly containing lists among its items.
    :return: flat list of items.
    """
    flat = []
    for item in lists:
        flat.extend(item if isinstance(item, list) else (item,))
    return flat
336def _quote(str_list):
337 return ', '.join('"{}"'.format(_) for _ in str_list) if str_list is not None else ''
340def pastml_pipeline(tree, data=None, data_sep='\t', id_index=0,
341 columns=None, prediction_method=MPPA, model=F81,
342 parameters=None, rate_matrix=None,
343 name_column=None, root_date=None, timeline_type=TIMELINE_SAMPLED,
344 tip_size_threshold=REASONABLE_NUMBER_OF_TIPS, colours=None,
345 out_data=None, html_compressed=None, html=None, html_mixed=None, work_dir=None,
346 verbose=False, forced_joint=False, upload_to_itol=False, itol_id=None, itol_project=None,
347 itol_tree_name=None, offline=False, threads=0, reoptimise=False, focus=None,
348 resolve_polytomies=False, smoothing=False, frequency_smoothing=False,
349 pajek=None, pajek_timing=VERTICAL):
350 """
351 Applies PastML to the given tree(s) with the specified states and visualises the result (as html maps).
353 :param tree: path to the input tree(s) in newick format (must be rooted).
354 :type tree: str
356 :param data: (optional) path to the annotation file in tab/csv format with the first row containing the column names.
357 If not given, the annotations should be contained in the tree file itself.
358 :type data: str
359 :param data_sep: (optional, by default '\t') column separator for the annotation table.
360 By default is set to tab, i.e. for tab-delimited file. Set it to ',' if your file is csv.
361 :type data_sep: char
362 :param id_index: (optional, by default is 0) index of the column in the annotation table
363 that contains the tree tip names, indices start from zero.
364 :type id_index: int
366 :param columns: (optional) name(s) of the annotation table column(s) that contain character(s)
367 to be analysed. If not specified all annotation table columns will be considered.
368 :type columns: str or list(str)
369 :param prediction_method: (optional, default is pastml.ml.MPPA) ancestral character reconstruction method(s),
370 can be one of the max likelihood (ML) methods: pastml.ml.MPPA, pastml.ml.MAP, pastml.ml.JOINT,
371 one of the max parsimony (MP) methods: pastml.parsimony.ACCTRAN, pastml.parsimony.DELTRAN,
372 pastml.parsimony.DOWNPASS; or pastml.acr.COPY to keep the annotated character states as-is without inference.
373 One can also specify one of the meta-methods: pastml.ml.ALL, pastml.ml.ML, pastml.parsimony.MP,
374 that would perform ACR with multiple methods (all of them for pastml.ml.ALL,
375 all the ML methods for pastml.ml.ML, or all the MP methods for pastml.parsimony.MP)
376 and save/visualise the results as multiple characters suffixed with the corresponding method.
377 When multiple ancestral characters are specified (with ``columns`` argument),
378 the same method can be used for all of them (if only one method is specified),
379 or different methods can be used (specified in the same order as ``columns``).
380 If multiple methods are given, but not for all the characters,
381 for the rest of them the default method (pastml.ml.MPPA) is chosen.'
382 :type prediction_method: str or list(str)
383 :param forced_joint: (optional, default is False) add JOINT state to the MPPA state selection
384 even if it is not selected by Brier score.
385 :type forced_joint: bool
386 :param model: (optional, default is pastml.models.f81_like.F81) evolutionary model(s) for ML methods
387 (ignored by MP methods).
388 When multiple ancestral characters are specified (with ``columns`` argument),
389 the same model can be used for all of them (if only one model is specified),
390 or different models can be used (specified in the same order as ``columns``).
391 If multiple models are given, but not for all the characters,
392 for the rest of them the default model (pastml.models.f81_like.F81) is chosen.
393 :type model: str or list(str)
394 :param parameters: optional way to fix some of the ML-method parameters.
395 Could be specified as
396 (1a) a dict {column: {param: value}},
397 where column corresponds to the character for which these parameters should be used,
398 or (1b) in a form {column: path_to_param_file};
399 or (2) as a list of paths to parameter files
400 (in the same order as ``columns`` argument that specifies characters)
401 possibly given only for the first few characters;
402 or (3) as a path to parameter file (only for the first character).
403 Each file should be tab-delimited, with two columns: the first one containing parameter names,
404 and the second, named "value", containing parameter values.
405 Parameters can include character state frequencies (parameter name should be the corresponding state,
406 and parameter value - the float frequency value, between 0 and 1),
407 tree branch scaling factor (parameter name pastml.ml.SCALING_FACTOR),
408 and tree branch smoothing factor (parameter name pastml.ml.SMOOTHING_FACTOR).
409 :type parameters: str or list(str) or dict
410 :param rate_matrix: (only for pastml.models.rate_matrix.CUSTOM_RATES model) path to the file(s)
411 specifying the rate matrix(ces).
412 Could be specified as
413 (1) a dict {column: path_to_file},
414 where column corresponds to the character for which this rate matrix should be used,
415 or (2) as a list of paths to rate matrix files
416 (in the same order as ``columns`` argument that specifies characters)
417 possibly given only for the first few characters;
418 or (3) as a path to rate matrix file (only for the first character).
419 The rate matrix file should specify character states in its first line, preceded by '# ' and separated by spaces.
420 The following lines should contain a symmetric squared rate matrix with positive rates
421 (and zeros on the diagonal), separated by spaces,
422 in the same order at the character states specified in the first line.
423 For example for four states, A, C, G, T and the rates A<->C 1, A<->G 4, A<->T 1, C<->G 1, C<->T 4, G<->T 1,
424 the rate matrix file would look like:
425 # A C G T
426 0 1 4 1
427 1 0 1 4
428 4 1 0 1
429 1 4 1 0
430 :type rate_matrix: str or list(str) or dict
431 :param reoptimise: (False by default) if set to True and the parameters are specified,
432 they will be considered as an optimisation starting point instead, and optimised.
433 :type reoptimise: bool
434 :param smoothing: (optional, default is False) apply a smoothing factor (optimised) to branch lengths
435 during likelihood calculation.
436 :type smoothing: bool
437 :param frequency_smoothing: (optional, default is False) apply a smoothing factor (optimised) to state frequencies
438 (given as input parameters, see parameters argument) during likelihood calculation.
439 If the selected model (model argument) does not allow for frequency optimisation, this option will be ignored.
440 If reoptimise argument is also set to True, the frequencies will only be smoothed but not reoptimised.
441 :type frequency_smoothing: bool
442 :param name_column: (optional) name of the annotation table column to be used for node names
443 in the compressed map visualisation
444 (must be one of those specified in ``columns``, if ``columns`` are specified).
445 If the annotation table contains only one column, it will be used by default.
446 :type name_column: str
447 :param root_date: (optional) date(s) of the root(s) (for dated tree(s) only),
448 if specified, used to visualise a timeline based on dates (otherwise it is based on distances to root).
449 :type root_date: str or pandas.datetime or float or list
450 :param tip_size_threshold: (optional, by default is 15) recursively remove the tips
451 of size less than threshold-th largest tip from the compressed map (set to 1e10 to keep all).
452 The larger it is the less tips will be trimmed.
453 :type tip_size_threshold: int
454 :param focus: optional way to put a focus on certain character state values,
455 so that the nodes in these states are displayed
456 even if they do not pass the trimming threshold (tip_size_threshold argument).
457 Should be in the form character:state.
458 :type focus: str or list(str)
459 :param timeline_type: (optional, by default is pastml.visualisation.cytoscape_manager.TIMELINE_SAMPLED)
460 type of timeline visualisation: at each date/distance to root selected on the slider, either
461 (pastml.visualisation.cytoscape_manager.TIMELINE_SAMPLED) all the lineages sampled after it are hidden; "
462 or (pastml.visualisation.cytoscape_manager.TIMELINE_NODES) all the nodes with a
463 more recent date/larger distance to root are hidden;
464 or (pastml.visualisation.cytoscape_manager.TIMELINE_LTT) all the nodes whose branch started
465 after this date/distance to root are hidden, and the external branches are cut
466 to the specified date/distance to root if needed;
467 :type timeline_type: str
468 :param colours: optional way to specify the colours used for character state visualisation.
469 Could be specified as
470 (1a) a dict {column: {state: colour}},
471 where column corresponds to the character for which these parameters should be used,
472 or (1b) in a form {column: path_to_colour_file};
473 or (2) as a list of paths to colour files
474 (in the same order as ``columns`` argument that specifies characters)
475 possibly given only for the first few characters;
476 or (3) as a path to colour file (only for the first character).
477 Each file should be tab-delimited, with two columns: the first one containing character states,
478 and the second, named "colour", containing colours, in HEX format (e.g. #a6cee3).
479 :type colours: str or list(str) or dict
480 :param resolve_polytomies: (default False) when True, the polytomies with a state change
481 (i.e. a parent node, P, in state A has more than 2 children, including m > 1 children, C_1, ..., C_m, in state B)
482 are resolved by grouping together same-state (different from the parent state) nodes
483 (i.e. a new internal node N in state B is created and becomes the child of P and the parent of C_1, ..., C_m).
484 :type resolve_polytomies: bool
486 :param out_data: path to the output annotation file with the reconstructed ancestral character states.
487 :type out_data: str
488 :param html_compressed: path to the output compressed visualisation file (html).
489 :type html_compressed: str
490 :param pajek: path to the output compressed visualisation file (Pajek NET Format).
491 Produced only if html_compressed is specified.
492 :type pajek: str
493 :param pajek_timing: the type of the compressed visualisation to be saved in Pajek NET Format (if pajek is specified).
494 Can be either 'VERTICAL' (default, after the nodes underwent vertical compression),
495 'HORIZONTAL' (after the nodes underwent vertical and horizontal compression)
496 or 'TRIM' (after the nodes underwent vertical and horizontal compression and minor node trimming).
497 :type pajek_timing: str
498 :param html: (optional) path to the output tree visualisation file (html).
499 :type html: str
500 :param html_mixed: (optional) path to the output mostly compressed map visualisation file (html),
501 where the nodes in states specified with the focus argument are uncompressed.
502 :type html_mixed: str
503 :param work_dir: (optional) path to the folder where pastml parameter, named tree
504 and marginal probability (for marginal ML methods (pastml.ml.MPPA, pastml.ml.MAP) only) files are to be stored.
505 Default is <path_to_input_file>/<input_file_name>_pastml. If the folder does not exist, it will be created.
506 :type work_dir: str
507 :param offline: (optional, default is False) By default (offline=False) PastML assumes
508 that there is an internet connection available,
509 which permits it to fetch CSS and JS scripts needed for visualisation online.
510 With offline=True, PastML will store all the needed CSS/JS scripts in the folder specified by work_dir,
511 so that internet connection is not needed
512 (but you must not move the output html files to any location other that the one specified by html/html_compressed.
513 :type offline: bool
515 :param verbose: (optional, default is False) print information on the progress of the analysis.
516 :type verbose: bool
518 :param threads: (optional, default is 0, which stands for automatic) number of threads PastML can use for parallesation.
519 By default, detected automatically based on the system. Note that PastML will at most use as many threads
520 as the number of characters (-c option) being analysed plus one.
521 :type threads: int
523 :param upload_to_itol: (optional, default is False) whether iTOL annotations
524 for the reconstructed characters associated with the named tree (i.e. the one found in work_dir) should be created.
525 If additionally itol_id and itol_project are specified,
526 the annotated tree will be automatically uploaded to iTOL (https://itol.embl.de/).
527 :type upload_to_itol: bool
528 :param itol_id: (optional) iTOL user batch upload ID that enables uploading to your iTOL account
529 (see https://itol.embl.de/help.cgi#batch).
530 :type itol_id: str
531 :param itol_project: (optional) iTOL project the annotated tree should be uploaded to
532 (must exist, and itol_id must be specified). If not specified, the tree will not be associated to any project.
533 :type itol_project: str
534 :param itol_tree_name: (optional) name for the tree uploaded to iTOL.
535 :type itol_tree_name: str
537 :return: void
538 """
539 logger = _set_up_pastml_logger(verbose)
540 copy_only = COPY == prediction_method or (isinstance(prediction_method, list)
541 and all(COPY == _ for _ in prediction_method))
543 roots, columns, column2states, name_column, age_label, parameters, rates = \
544 _validate_input(tree, columns, name_column if html_compressed or html_mixed else None, data, data_sep, id_index,
545 root_date if html_compressed or html or html_mixed or upload_to_itol else None,
546 copy_only=copy_only, parameters=parameters, rates=rate_matrix)
547 if not work_dir:
548 work_dir = get_pastml_work_dir(tree)
549 os.makedirs(work_dir, exist_ok=True)
551 if threads < 1:
552 threads = max(os.cpu_count(), 1)
554 acr_results = acr(forest=roots, columns=columns, column2states=column2states,
555 prediction_method=prediction_method, model=model, column2parameters=parameters,
556 column2rates=rates,
557 force_joint=forced_joint, threads=threads, reoptimise=reoptimise, tau=None if smoothing else 0,
558 resolve_polytomies=resolve_polytomies, frequency_smoothing=frequency_smoothing)
560 column2states = {acr_result[CHARACTER]: acr_result[STATES] for acr_result in acr_results}
561 if not out_data:
562 out_data = os.path.join(work_dir, get_combined_ancestral_state_file())
564 state_df = _serialize_predicted_states(sorted(column2states.keys()), out_data, roots,
565 dates_are_dates=age_label == DATE_LABEL)
567 # a meta-method would have added a suffix to the name feature
568 if html_compressed and name_column and name_column not in column2states:
569 ml_name_column = get_personalized_feature_name(name_column, get_default_ml_method())
570 name_column = ml_name_column if ml_name_column in column2states \
571 else get_personalized_feature_name(name_column, get_default_mp_method())
573 itol_result = None
574 new_tree = os.path.join(work_dir, get_named_tree_file(tree))
575 features = [DATE, DATE_CI] + list(column2states.keys())
577 clear_extra_features(roots, features)
578 nwks = '\n'.join([roots[0].write(format_root_node=True, parser=3, props=features) for root in roots])
579 with open(new_tree, 'w+') as f:
580 f.write(nwks)
581 try:
582 nexus = new_tree.replace('.nwk', '.nexus')
583 if '.nexus' not in nexus:
584 nexus = '{}.nexus'.format(nexus)
585 write(NewickIO.parse(StringIO(nwks)), nexus, 'nexus')
586 with open(nexus, 'r') as f:
587 nexus_str = f.read().replace('&&NHX:', '&')
588 for feature in features:
589 nexus_str = nexus_str.replace(':{}='.format(feature), ',{}='.format(feature))
590 with open(nexus, 'w') as f:
591 f.write(nexus_str)
592 except Exception as e:
593 logger.error(
594 'Did not manage to save the annotated tree in nexus format due to the following error: {}'.format(e))
595 pass
597 if upload_to_itol or html or html_compressed:
598 if colours:
599 if isinstance(colours, str):
600 colours = [colours]
601 if isinstance(colours, list):
602 colours = dict(zip(columns, colours))
603 elif isinstance(colours, dict):
604 colours = {col_name2cat(col): cls for (col, cls) in colours.items()}
605 else:
606 raise ValueError('Colours should be either a list or a dict, got {}.'.format(type(colours)))
607 else:
608 colours = {}
610 if threads > 1:
611 pool = ThreadPool(processes=threads - 1)
612 async_result = pool.map_async(func=_serialize_acr, iterable=((acr_res, work_dir) for acr_res in acr_results))
613 if upload_to_itol:
614 if DATE_LABEL == age_label:
615 try:
616 dates = state_df[DATE].apply(lambda _: numeric2datetime(_).strftime("%d %b %Y"))
617 state_df[DATE] = dates
618 except:
619 pass
620 itol_result = pool.apply_async(func=generate_itol_annotations,
621 args=(column2states, work_dir, acr_results, state_df, age_label,
622 new_tree, itol_id, itol_project, itol_tree_name, colours))
623 else:
624 for acr_res in acr_results:
625 _serialize_acr((acr_res, work_dir))
626 if upload_to_itol:
627 if DATE_LABEL == age_label:
628 try:
629 dates = state_df[DATE].apply(lambda _: numeric2datetime(_).strftime("%d %b %Y"))
630 state_df[DATE] = dates
631 except:
632 pass
633 generate_itol_annotations(column2states, work_dir, acr_results, state_df, age_label,
634 new_tree, itol_id, itol_project, itol_tree_name, colours)
636 if html or html_compressed or html_mixed:
637 logger.debug('\n=============VISUALISATION=====================')
639 if (html_compressed or html_mixed) and focus:
640 def parse_col_val(cv):
641 cv = str(cv).strip()
642 colon_pos = cv.find(':')
643 if colon_pos == -1:
644 if len(column2states) == 1 and cv in next(iter(column2states.values())):
645 return next(iter(column2states.keys())), cv
646 else:
647 raise ValueError('Focus values should be in a form character:state, got {} instead.'.format(cv))
648 col, state = col_name2cat(cv[:colon_pos]), cv[colon_pos + 1:]
649 if col not in column2states:
650 ml_col = get_personalized_feature_name(col, get_default_ml_method())
651 if ml_col in column2states:
652 col = ml_col
653 else:
654 mp_col = get_personalized_feature_name(col, get_default_mp_method())
655 if mp_col in column2states:
656 col = mp_col
657 else:
658 raise ValueError('Character {} specified for focus values is not found in metadata.'.format(
659 cv[:colon_pos]))
660 if state not in column2states[col]:
661 raise ValueError(
662 'Character {} state {} not found among possible states in metadata.'.format(cv[:colon_pos],
663 state))
664 return col, state
666 if isinstance(focus, str):
667 focus = list(focus)
668 if not isinstance(focus, list):
669 raise ValueError(
670 'Focus argument should be either a string or a list of strings, got {} instead.'.format(
671 type(focus)))
672 focus_cv = [parse_col_val(_) for _ in focus]
673 focus = defaultdict(set)
674 for c, v in focus_cv:
675 focus[c].add(v)
677 visualize(roots, column2states=column2states, html=html, html_compressed=html_compressed, html_mixed=html_mixed,
678 name_column=name_column, tip_size_threshold=tip_size_threshold, date_label=age_label,
679 timeline_type=timeline_type, work_dir=work_dir, local_css_js=offline, column2colours=colours,
680 focus=focus, pajek=pajek, pajek_timing=pajek_timing)
682 if threads > 1:
683 async_result.wait()
684 if upload_to_itol:
685 itol_result.wait()
686 pool.close()
689def parse_date(d):
690 try:
691 return float(d)
692 except ValueError:
693 try:
694 return datetime2numeric(pd.to_datetime(d, infer_datetime_format=True))
695 except ValueError:
696 raise ValueError('Could not infer the date format for root date "{}", please check it.'
697 .format(d))
def _validate_input(tree_nwk, columns=None, name_column=None, data=None, data_sep='\t', id_index=0,
                    root_dates=None, copy_only=False, parameters=None, rates=None):
    """
    Reads the input forest and (optional) annotation table, checks their consistency,
    and prepares the data structures needed for ancestral character reconstruction.

    :param tree_nwk: path to the input tree(s) in newick format
    :param columns: character column name(s) to analyse (str or list); if None and data is given,
        all annotation columns are used
    :param name_column: character whose states should be used for node naming in visualisations
    :param data: path to the annotation table; if None, annotations are taken from the tree itself
    :param data_sep: column separator of the annotation table
    :param id_index: index of the annotation table column containing tree tip names
    :param root_dates: root date(s), one per tree (or one for all trees)
    :param copy_only: whether only the COPY method will be applied (relaxes annotation checks)
    :param parameters: parameter file path(s) (str/list/dict) keyed by character
    :param rates: rate matrix file path(s) (str/list/dict) keyed by character
    :return: tuple (roots, columns, column2states, name_column, age_label, parameters, rates)
    :raises ValueError: on inconsistent or insufficient input
    """
    logger = logging.getLogger('pastml')
    logger.debug('\n=============INPUT DATA VALIDATION=============')

    if not columns and data is None:
        raise ValueError("If you don't provide the metadata file, "
                         "you need to provide an annotated tree and specify the columns argument, "
                         "which will be used to look for character annotations in your input tree.")

    if columns and isinstance(columns, str):
        columns = [columns]

    roots = read_forest(tree_nwk, columns=columns if data is None else None)
    # negative branch lengths are biologically meaningless: clamp them to zero
    num_neg = 0
    for root in roots:
        for _ in root.traverse():
            if _.dist < 0:
                num_neg += 1
                _.dist = 0
    if num_neg:
        # pluralize only when the forest actually contains several trees
        logger.warning('Input tree{} contained {} negative branches: we put them to zero.'
                       .format('s' if len(roots) > 1 else '', num_neg))
    logger.debug('Read the tree{} {}.'.format('s' if len(roots) > 1 else '', tree_nwk))

    column2annotated = Counter()
    column2states = defaultdict(set)

    if data:
        df = pd.read_csv(data, sep=data_sep, index_col=id_index, header=0, dtype=str)
        df.index = df.index.map(str)
        logger.debug('Read the annotation file {}.'.format(data))
        if columns:
            unknown_columns = set(columns) - set(df.columns)
            if unknown_columns:
                raise ValueError('{} of the specified columns ({}) {} not found among the annotation columns: {}.'
                                 .format('One' if len(unknown_columns) == 1 else 'Some',
                                         _quote(unknown_columns),
                                         'is' if len(unknown_columns) == 1 else 'are',
                                         _quote(df.columns)))
            df = df[columns]
        df.columns = [col_name2cat(column) for column in df.columns]
        if name_column:
            name_column = col_name2cat(name_column)
        columns = df.columns

        node_names = set.union(*[{n.name for n in root.traverse() if n.name} for root in roots])
        df_index_names = set(df.index)
        common_ids = node_names & df_index_names

        # strip quotes if needed: tree readers sometimes keep quoting around tip names
        if not common_ids:
            node_names = {_.strip("'").strip('"') for _ in node_names}
            common_ids = node_names & df_index_names
            if common_ids:
                for root in roots:
                    for n in root.traverse():
                        # guard against unnamed (internal) nodes whose name might be empty/None
                        if n.name:
                            n.name = n.name.strip("'").strip('"')

        filtered_df = df.loc[list(common_ids), :]
        if not filtered_df.shape[0]:
            tip_name_representatives = []
            for _ in roots[0].iter_leaves():
                if len(tip_name_representatives) < 3:
                    tip_name_representatives.append(_.name)
                else:
                    break
            raise ValueError(
                'Your tree tip names (e.g. {}) do not correspond to annotation id column values (e.g. {}). '
                'Check your annotation file.'
                .format(', '.join(tip_name_representatives),
                        ', '.join(list(df_index_names)[: min(len(df_index_names), 3)])))
        logger.debug('Checked that (at least some of) tip names correspond to annotation file index.')
        preannotate_forest(roots, df=df)
        for c in df.columns:
            column2states[c] |= {_ for _ in df[c].unique() if pd.notnull(_) and _ != ''}

    num_tips = 0

    # collect, per character: the observed states and the number of annotated nodes
    column2annotated_states = defaultdict(set)
    for root in roots:
        for n in root.traverse():
            for c in columns:
                vs = n.props.get(c, set())
                column2states[c] |= vs
                column2annotated_states[c] |= vs
                if vs:
                    column2annotated[c] += 1
            if n.is_leaf:
                num_tips += 1

    # the least annotated character drives the "enough data" check
    if column2annotated:
        c, num_annotated = min(column2annotated.items(), key=lambda _: _[1])
    else:
        c, num_annotated = columns[0], 0
    percentage_unknown = (num_tips - num_annotated) / num_tips
    if percentage_unknown >= (.9 if not copy_only else 1):
        raise ValueError('{:.1f}% of tip annotations for character "{}" are unknown, '
                         'not enough data to infer ancestral states. '
                         '{}'
                         .format(percentage_unknown * 100, c,
                                 'Check your annotation file and if its ids correspond to the tree tip/node names.'
                                 if data
                                 else 'Your tree file should contain character state annotations, '
                                      'otherwise consider specifying a metadata file.'))

    # too many distinct states vs the number of tips suggests a continuous character
    c, states = min(column2annotated_states.items(), key=lambda _: len(_[1]))
    if len(states) > num_tips * .75 and not copy_only:
        raise ValueError('Character "{}" has {} unique states annotated in this tree: {}, '
                         'which is too much to infer on a {} with only {} tips. '
                         'Make sure the character you are analysing is discrete, and if yes use a larger tree.'
                         .format(c, len(states), states, 'tree' if len(roots) == 1 else 'forest', num_tips))

    if name_column and name_column not in columns:
        raise ValueError('The name column ("{}") should be one of those specified as columns ({}).'
                         .format(name_column, _quote(columns)))
    elif len(columns) == 1:
        # with a single character it is the obvious default for node naming
        name_column = columns[0]

    # Process root dates: accept a single date for the whole forest or one date per tree
    if root_dates is not None:
        root_dates = [parse_date(d) for d in (root_dates if isinstance(root_dates, list) else [root_dates])]
        if 1 < len(root_dates) < len(roots):
            raise ValueError('{} trees are given, but only {} root dates.'.format(len(roots), len(root_dates)))
        elif 1 == len(root_dates):
            root_dates *= len(roots)
    # use a date-based timeline only if dates are available (given or pre-annotated on the roots)
    age_label = DIST_TO_ROOT_LABEL \
        if (root_dates is None and not next((True for root in roots if root.props.get(DATE, None) is not None), False)) \
        else DATE_LABEL
    annotate_dates(roots, root_dates=root_dates)
    logger.debug('Finished input validation.')

    column2states = {c: np.array(sorted(states)) for c, states in column2states.items()}

    # normalize parameters to a {character: parameter-file} dict
    if parameters:
        if isinstance(parameters, str):
            parameters = [parameters]
        if isinstance(parameters, list):
            parameters = dict(zip(columns, parameters))
        elif isinstance(parameters, dict):
            parameters = {col_name2cat(col): params for (col, params) in parameters.items()}
        else:
            raise ValueError('Parameters should be either a list or a dict, got {}.'.format(type(parameters)))
    else:
        parameters = {}

    # normalize rate matrices to a {character: rate-matrix-file} dict
    if rates:
        if isinstance(rates, str):
            rates = [rates]
        if isinstance(rates, list):
            rates = dict(zip(columns, rates))
        elif isinstance(rates, dict):
            rates = {col_name2cat(col): rs for (col, rs) in rates.items()}
        else:
            raise ValueError('Rate matrices should be either a list or a dict, got {}.'.format(type(rates)))
    else:
        rates = {}

    # give unique names to unnamed nodes (suffixed per tree when several trees are given)
    for i, tree in enumerate(roots):
        name_tree(tree, suffix='' if len(roots) == 1 else '_{}'.format(i))

    return roots, columns, column2states, name_column, age_label, parameters, rates
def _serialize_predicted_states(columns, out_data, roots, dates_are_dates=True):
    """
    Writes the (possibly multi-valued) reconstructed character states of all forest nodes
    to a tab-delimited file and returns them as a DataFrame.

    In the output file a node with several predicted states for some character(s) occupies
    several rows: the i-th row holds the i-th state for each character that still has one
    (blank cell otherwise). The returned DataFrame instead keeps the raw state sets,
    together with each node's branch length and formatted date.

    :param columns: characters (column names) whose states should be serialized
    :param out_data: path to the output tab-delimited file
    :param roots: root nodes of the trees in the forest
    :param dates_are_dates: passed to get_formatted_date to control date formatting
    :return: pandas DataFrame indexed by node name, with 'dist', date and character columns
    """
    ids, data = [], []
    # Not using DataFrames to speed up document writing
    with open(out_data, 'w+') as f:
        # header row: node name followed by one column per character
        f.write('node\t{}\n'.format('\t'.join(columns)))
        for root in roots:
            for node in root.traverse():
                # row for the returned DataFrame: branch length, date, then raw state sets
                vs = [node.dist, get_formatted_date(node, dates_are_dates)]
                column2values = {}
                for column in columns:
                    value = node.props.get(column, set())
                    vs.append(value)
                    if value:
                        # reverse-sorted so that list.pop() below yields states
                        # in ascending order across the emitted rows
                        column2values[column] = sorted(value, reverse=True)
                data.append(vs)
                ids.append(node.name)
                # emit one file row per remaining state combination index;
                # a character is dropped from column2values once its states are exhausted
                while column2values:
                    f.write('{}'.format(node.name))
                    for column in columns:
                        if column in column2values:
                            values = column2values[column]
                            value = values.pop()
                            if not values:
                                del column2values[column]
                        else:
                            # this character has fewer states than others: blank cell
                            value = ''
                        f.write('\t{}'.format(value))
                    f.write('\n')
    logging.getLogger('pastml').debug('Serialized reconstructed states to {}.'.format(out_data))
    return pd.DataFrame(index=ids, data=data, columns=['dist', DATE] + columns)
898def _set_up_pastml_logger(verbose):
899 logger = logging.getLogger('pastml')
900 logger.setLevel(level=logging.DEBUG if verbose else logging.ERROR)
901 logger.propagate = False
902 if not logger.hasHandlers():
903 ch = logging.StreamHandler()
904 formatter = logging.Formatter('%(name)s:%(levelname)s:%(asctime)s %(message)s', datefmt="%H:%M:%S")
905 ch.setFormatter(formatter)
906 logger.addHandler(ch)
907 return logger
def main():
    """
    Entry point, calling :py:func:`pastml.acr.pastml_pipeline` with command-line arguments.

    :return: void
    """
    import argparse

    parser = argparse.ArgumentParser(description="Ancestral character reconstruction and visualisation "
                                                 "for rooted phylogenetic trees.", prog='pastml')

    # --- input tree ---
    tree_group = parser.add_argument_group('tree-related arguments')
    tree_group.add_argument('-t', '--tree', help="input tree(s) in newick format (must be rooted).",
                            type=str, required=True)

    # --- annotation table ---
    annotation_group = parser.add_argument_group('annotation-file-related arguments')
    annotation_group.add_argument('-d', '--data', required=False, type=str, default=None,
                                  help="annotation file in tab/csv format with the first row "
                                       "containing the column names. "
                                       "If not given, the annotations should be contained in the tree file itself.")
    annotation_group.add_argument('-s', '--data_sep', required=False, type=str, default='\t',
                                  help="column separator for the annotation table. "
                                       "By default is set to tab, i.e. for a tab-delimited file. "
                                       "Set it to ',' if your file is csv.")
    annotation_group.add_argument('-i', '--id_index', required=False, type=int, default=0,
                                  help="index of the annotation table column containing tree tip names, "
                                       "indices start from zero (by default is set to 0).")

    # --- ancestral character reconstruction options ---
    acr_group = parser.add_argument_group('ancestral-character-reconstruction-related arguments')
    acr_group.add_argument('-c', '--columns', nargs='*',
                           help="names of the annotation table columns that contain characters "
                                "to be analysed. "
                                "If not specified, all columns are considered.",
                           type=str)
    acr_group.add_argument('--prediction_method',
                           choices=[MPPA, MAP, JOINT, DOWNPASS, ACCTRAN, DELTRAN, COPY, ALL, ML, MP],
                           type=str, nargs='*', default=MPPA,
                           help='ancestral character reconstruction (ACR) method, '
                                'can be one of the max likelihood (ML) methods: {ml}, '
                                'one of the max parsimony (MP) methods: {mp}; '
                                'or {copy} to keep the annotated character states as-is without inference. '
                                'One can also specify one of the meta-methods {meta} that would perform ACR '
                                'with multiple methods (all of them for {meta_all}, '
                                'all the ML methods for {meta_ml}, or all the MP methods for {meta_mp}) '
                                'and save/visualise the results as multiple characters '
                                'suffixed with the corresponding method.'
                                'When multiple ancestral characters are specified (see -c, --columns), '
                                'the same method can be used for all of them (if only one method is specified), '
                                'or different methods can be used (specified in the same order as -c, --columns). '
                                'If multiple methods are given, but not for all the characters, '
                                'for the rest of them the default method ({default}) is chosen.'
                           .format(ml=', '.join(ML_METHODS), mp=', '.join(MP_METHODS), copy=COPY, default=MPPA,
                                   meta=', '.join(META_ML_METHODS | {MP}), meta_ml=ML, meta_mp=MP, meta_all=ALL))
    acr_group.add_argument('--forced_joint', action='store_true',
                           help='add {joint} state to the {mppa} state selection '
                                'even if it is not selected by Brier score.'.format(joint=JOINT, mppa=MPPA))
    acr_group.add_argument('-m', '--model', default=F81,
                           choices=[JC, F81, EFT, HKY, JTT, CUSTOM_RATES],
                           type=str, nargs='*',
                           help='evolutionary model for ML methods (ignored by MP methods). '
                                'When multiple ancestral characters are specified (see -c, --columns), '
                                'the same model can be used for all of them (if only one model is specified), '
                                'or different models can be used (specified in the same order as -c, --columns). '
                                'If multiple models are given, but not for all the characters, '
                                'for the rest of them the default model ({}) is chosen.'.format(F81))
    acr_group.add_argument('--parameters', type=str, nargs='*',
                           help='optional way to fix some of the ML-method parameters '
                                'by specifying files that contain them. '
                                'Should be in the same order '
                                'as the ancestral characters (see -c, --columns) '
                                'for which the reconstruction is to be preformed. '
                                'Could be given only for the first few characters. '
                                'Each file should be tab-delimited, with two columns: '
                                'the first one containing parameter names, '
                                'and the second, named "value", containing parameter values. '
                                'Parameters can include character state frequencies '
                                '(parameter name should be the corresponding state, '
                                'and parameter value - the float frequency value, between 0 and 1),'
                                'tree branch scaling factor (parameter name {}),'.format(SCALING_FACTOR) +
                                'and tree branch smoothing factor (parameter name {}),'.format(SMOOTHING_FACTOR))
    acr_group.add_argument('--rate_matrix', type=str, nargs='*',
                           help='(only for {} model) path to the file(s) containing the rate matrix(ces). '
                                'Should be in the same order '
                                'as the ancestral characters (see -c, --columns) '
                                'for which the reconstruction is to be preformed. '
                                'Could be given only for the first few characters. '
                                'The rate matrix file should specify character states in its first line, '
                                'preceded by # and separated by spaces. '
                                'The following lines should contain a symmetric squared rate matrix with positive rates'
                                '(and zeros on the diagonal), separated by spaces, '
                                'in the same order at the character states specified in the first line.'
                                'For example, for four states, A, C, G, T '
                                'and the rates A<->C 1, A<->G 4, A<->T 1, C<->G 1, C<->T 4, G<->T 1,'
                                'the rate matrix file would look like:\n'
                                '# A C G T\n'
                                '0 1 4 1\n'
                                '1 0 1 4\n'
                                '4 1 0 1\n'
                                '1 4 1 0'.format(CUSTOM_RATES))
    acr_group.add_argument('--reoptimise', action='store_true',
                           help='if the parameters are specified, they will be considered as an optimisation '
                                'starting point instead and optimised.')
    acr_group.add_argument('--smoothing', action='store_true',
                           help='Apply a smoothing factor (optimised) to branch lengths during likelihood calculation.')
    acr_group.add_argument('--frequency_smoothing', action='store_true',
                           help='Apply a smoothing factor (optimised) to state frequencies '
                                '(given as input parameters, see --parameters) '
                                'during likelihood calculation. '
                                'If the selected model (--model) does not allow for frequency optimisation,'
                                ' this option will be ignored. '
                                'If --reoptimise is also specified, '
                                'the frequencies will only be smoothed but not reoptimised. ')

    # --- visualisation options ---
    vis_group = parser.add_argument_group('visualisation-related arguments')
    vis_group.add_argument('-n', '--name_column', type=str, default=None,
                           help="name of the character to be used for node names "
                                "in the compressed map visualisation "
                                "(must be one of those specified via -c, --columns). "
                                "If the annotation table contains only one column it will be used by default.")
    vis_group.add_argument('--root_date', required=False, default=None,
                           help="date(s) of the root(s) (for dated tree(s) only), "
                                "if specified, used to visualise a timeline based on dates "
                                "(otherwise it is based on distances to root).",
                           type=str, nargs='*')
    vis_group.add_argument('--tip_size_threshold', type=int, default=REASONABLE_NUMBER_OF_TIPS,
                           help="recursively remove the tips of size less than threshold-th largest tip"
                                "from the compressed map (set to 1e10 to keep all tips). "
                                "The larger it is the less tips will be trimmed.")
    vis_group.add_argument('--timeline_type', type=str, default=TIMELINE_SAMPLED,
                           help="type of timeline visualisation: at each date/distance to root selected on the slider "
                                "either ({sampled}) - all the lineages sampled after it are hidden; "
                                "or ({nodes}) - all the nodes with a more recent date/larger distance to root are hidden; "
                                "or ({ltt}) - all the nodes whose branch started after this date/distance to root "
                                "are hidden, and the external branches are cut to the specified date/distance to root "
                                "if needed;".format(sampled=TIMELINE_SAMPLED, ltt=TIMELINE_LTT, nodes=TIMELINE_NODES),
                           choices=[TIMELINE_SAMPLED, TIMELINE_NODES, TIMELINE_LTT])
    vis_group.add_argument('--offline', action='store_true',
                           help="By default (without --offline option) PastML assumes "
                                "that there is an internet connection available, "
                                "which permits it to fetch CSS and JS scripts needed for visualisation online."
                                "With --offline option turned on, PastML will store all the needed CSS/JS scripts "
                                "in the folder specified by --work_dir, so that internet connection is not needed "
                                "(but you must not move the output html files to any location "
                                "other that the one specified by --html/--html_compressed).")
    vis_group.add_argument('--colours', type=str, nargs='*',
                           help='optional way to specify the colours used for character state visualisation. '
                                'Should be in the same order '
                                'as the ancestral characters (see -c, --columns) '
                                'for which the reconstruction is to be preformed. '
                                'Could be given only for the first few characters. '
                                'Each file should be tab-delimited, with two columns: '
                                'the first one containing character states, '
                                'and the second, named "colour", containing colours, in HEX format (e.g. #a6cee3).')
    vis_group.add_argument('--focus', type=str, nargs='*',
                           help='optional way to put a focus on certain character state values, '
                                'so that the nodes in these states are displayed '
                                'even if they do not pass the trimming threshold (--tip_size_threshold). '
                                'Should be in the form character:state.')
    vis_group.add_argument('--resolve_polytomies', action='store_true',
                           help='When specified, the polytomies with a state change '
                                '(i.e. a parent node, P, in state A has more than 2 children, '
                                'including m > 1 children, C_1, ..., C_m, in state B) are resolved '
                                'by grouping together same-state (different from the parent state) nodes '
                                '(i.e. a new internal node N in state B is created and becomes the child of P '
                                'and the parent of C_1, ..., C_m).')

    # --- output options ---
    out_group = parser.add_argument_group('output-related arguments')
    out_group.add_argument('-o', '--out_data', required=False, type=str,
                           help="path to the output annotation file with the reconstructed ancestral character states.")
    out_group.add_argument('--work_dir', required=False, default=None, type=str,
                           help="path to the folder where pastml parameter, named tree "
                                "and marginal probability (for marginal ML methods ({}) only) files are to be stored. "
                                "Default is <path_to_input_file>/<input_file_name>_pastml. "
                                "If the folder does not exist, it will be created."
                                .format(', '.join(MARGINAL_ML_METHODS)))
    out_group.add_argument('--html_compressed', required=False, default=None, type=str,
                           help="path to the output compressed map visualisation file (html).")
    out_group.add_argument('--pajek', required=False, default=None, type=str,
                           help="path to the output vertically compressed visualisation file (Pajek NET Format). "
                                "Prooduced only if --html_compressed is specified.")
    out_group.add_argument('--pajek_timing', required=False, default=VERTICAL, choices=(VERTICAL, HORIZONTAL, TRIM),
                           type=str,
                           help="the type of the compressed visualisation to be saved in Pajek NET Format "
                                "(if --pajek is specified). "
                                "Can be either {} (default, after the nodes underwent vertical compression), "
                                "{} (after the nodes underwent vertical and horizontal compression) "
                                "or {} (after the nodes underwent vertical and horizontal compression"
                                " and minor node trimming)".format(VERTICAL, HORIZONTAL, TRIM))
    out_group.add_argument('--html', required=False, default=None, type=str,
                           help="path to the output full tree visualisation file (html).")
    out_group.add_argument('--html_mixed', required=False, default=None, type=str,
                           help="path to the output mostly compressed map visualisation file (html), "
                                "where the nodes in states specified with --focus are uncompressed.")
    out_group.add_argument('-v', '--verbose', action='store_true',
                           help="print information on the progress of the analysis (to console)")

    parser.add_argument('--version', action='version', version='%(prog)s {version}'.format(version=PASTML_VERSION))

    parser.add_argument('--threads', required=False, default=0, type=int,
                        help="Number of threads PastML can use for parallesation. "
                             "By default detected automatically based on the system. "
                             "Note that PastML will at most use as many threads "
                             "as the number of characters (-c option) being analysed plus one.")

    # --- iTOL upload options ---
    itol_group = parser.add_argument_group('iTOL-related arguments')
    itol_group.add_argument('--upload_to_itol', action='store_true',
                            help="create iTOL annotations for the reconstructed characters "
                                 "associated with the named tree (i.e. the one found in --work_dir). "
                                 "If additionally --itol_id and --itol_project are specified, "
                                 "the annotated tree will be automatically uploaded to iTOL (https://itol.embl.de/).")
    itol_group.add_argument('--itol_id', required=False, default=None, type=str,
                            help="iTOL user batch upload ID that enables uploading to your iTOL account "
                                 "(see https://itol.embl.de/help.cgi#batch).")
    itol_group.add_argument('--itol_project', required=False, default="Sample project", type=str,
                            help="iTOL project the annotated tree should be associated with "
                                 "(must exist, and --itol_id must be specified). By default set to 'Sample project'.")
    itol_group.add_argument('--itol_tree_name', required=False, default=None, type=str,
                            help="name for the tree uploaded to iTOL.")

    params = parser.parse_args()

    # argument names deliberately match pastml_pipeline's keyword parameters
    pastml_pipeline(**vars(params))
if __name__ == '__main__':
    main()