# Source code for snpio.analysis.tree_parser

import os
from pathlib import Path
from typing import Any, Dict, List, Optional

import numpy as np
import pandas as pd
import toytree as tt

from snpio.read_input.genotype_data import GenotypeData


class TreeParser(GenotypeData):
    """Read, write, and manipulate phylogenetic trees.

    TreeParser inherits from GenotypeData and adds tree handling on top of
    it: reading a tree file into a toytree object, writing a tree, computing
    basic statistics, extracting subtrees, pruning tips, rerooting, and
    computing a pairwise node distance matrix. It can also load a rate
    matrix Q and site-specific substitution rates from files.

    Example:
        >>> tp = TreeParser(
        ...     genotype_data=gd_filt,
        ...     treefile="snpio/example_data/trees/test.tre",
        ...     qmatrix="snpio/example_data/trees/test.iqtree",
        ...     siterates="snpio/example_data/trees/test14K.rate",
        ...     verbose=True,
        ...     debug=False,
        ... )
        >>>
        >>> tree = tp.read_tree()
        >>> print(tp.tree_stats())
        >>> tp.reroot_tree("~EA")
        >>> print(tp.get_distance_matrix())
        >>> print(tp.qmat)
        >>> print(tp.site_rates)
        >>> subtree = tp.get_subtree("~EA")
        >>> pruned_tree = tp.prune_tree("~ON")
        >>> print(tp.write_tree(subtree, save_path=None))
        >>> print(tp.write_tree(pruned_tree, save_path=None))

    Attributes:
        genotype_data (GenotypeData): GenotypeData object containing the SNP data.
        treefile (str): Path to the phylogenetic tree file.
        qmatrix (str): Path to the Q matrix file.
        siterates (str): Path to the site rates file.
        verbose (bool): Whether to display verbose output.
        debug (bool): Whether to display debug output.
    """

    def __init__(
        self,
        genotype_data: Any,
        treefile: str,
        qmatrix: Optional[str] = None,
        siterates: Optional[str] = None,
        verbose: bool = False,
        debug: bool = False,
    ) -> None:
        """Initialize the TreeParser object.

        The parent GenotypeData is initialized from the attributes of the
        supplied ``genotype_data`` object with ``filetype="tree"``.

        Args:
            genotype_data (Any): GenotypeData object containing the SNP data.
            treefile (str): Path to the phylogenetic tree file.
            qmatrix (str, optional): Path to the Q matrix file. Defaults to None.
            siterates (str, optional): Path to the site rates file. Defaults to None.
            verbose (bool, optional): Whether to display verbose output. Defaults to False.
            debug (bool, optional): Whether to display debug output. Defaults to False.
        """
        # Initialize the parent class GenotypeData
        super().__init__(
            filename=genotype_data.filename,
            filetype="tree",
            popmapfile=genotype_data.popmapfile,
            force_popmap=genotype_data.force_popmap,
            exclude_pops=genotype_data.exclude_pops,
            include_pops=genotype_data.include_pops,
            plot_format=genotype_data.plot_format,
            prefix=genotype_data.prefix,
            verbose=genotype_data.verbose,
            debug=genotype_data.debug,
            plot_fontsize=genotype_data.plot_fontsize,
            plot_dpi=genotype_data.plot_dpi,
            loci_indices=genotype_data.loci_indices,
            sample_indices=genotype_data.sample_indices,
        )

        self.logger = genotype_data.logger
        self.genotype_data = genotype_data

        self.treefile = treefile
        self.qmatrix = qmatrix
        self.siterates = siterates
        self.verbose = verbose
        self.debug = debug

        # Lazily populated caches backing the tree/qmat/site_rates properties.
        self._tree = None
        self._qmat = None
        self._site_rates = None

    def read_tree(self) -> tt.ToyTree:
        """Read the tree file at ``self.treefile`` into a toytree object.

        Returns:
            toytree.tree object: The input tree as a toytree object.

        Raises:
            FileNotFoundError: If the tree file is not found.
            PermissionError: If the tree file exists but is not readable.
        """
        if not Path(self.treefile).is_file():
            msg = f"File {self.treefile} not found!"
            self.logger.error(msg)
            raise FileNotFoundError(msg)

        if not os.access(self.treefile, os.R_OK):
            msg = f"Tree file {self.treefile} is unreadable."
            self.logger.error(msg)
            raise PermissionError(msg)

        return tt.tree(self.treefile)

    def write_tree(
        self, tree: tt.ToyTree, save_path: Optional[str] = None, nexus: bool = False
    ) -> Optional[str]:
        """Write the phylogenetic tree to a file or return it as a string.

        The tree is serialized with ``tree.write(path=save_path)``. If that
        call returns a value it is returned to the caller; otherwise a log
        message is emitted and None is returned.

        Args:
            tree (toytree.tree): The tree object to save.
            save_path (str, optional): Path to save the tree file. If left as
                None, a string representation of the tree is returned.
                Defaults to None.
            nexus (bool, optional): Intended to select NEXUS instead of Newick
                output. Defaults to False.

        Returns:
            Optional[str]: The string representation of the tree if one was
            produced by ``tree.write``; otherwise None.

        Raises:
            TypeError: If the input tree is not a toytree object.
        """
        if not isinstance(tree, tt.ToyTree):
            msg = f"Input tree must be a toytree object, but got: {type(tree)}."
            self.logger.error(msg)
            raise TypeError(msg)

        # NOTE(review): `nexus` is accepted but never forwarded to
        # `tree.write`, so it currently has no effect. Confirm the toytree
        # writer API before wiring it through.
        tree_str = tree.write(path=save_path)
        if tree_str is not None:
            return tree_str

        self.logger.info(f"Tree saved to {save_path}")
        return None

    def tree_stats(self) -> Dict[str, Any]:
        """Calculate basic statistics for the phylogenetic tree.

        Returns:
            Dict[str, Any]: Dictionary with the keys ``num_tips``,
            ``num_nodes``, ``max_tree_height``, ``tree_length``,
            ``is_rooted`` and ``is_bifurcating``.
        """
        tree = self.tree

        # Fetch the node table once and reuse it for both height statistics.
        heights = tree.get_node_data()["height"]

        stats = {
            "num_tips": tree.ntips,
            "num_nodes": tree.nnodes,
            "max_tree_height": heights.max(),
            # NOTE(review): this is the sum of node heights, not the sum of
            # branch lengths -- confirm this is the intended "tree length".
            "tree_length": heights.sum(),
            "is_rooted": tree.is_rooted(),
            "is_bifurcating": tree.is_bifurcating(),
        }

        self.logger.debug(f"Tree statistics: {stats}")
        return stats

    def get_subtree(self, regex: str) -> tt.ToyTree:
        """Extract a subtree selected by a node/tip query.

        The query is passed unchanged to ``tree.mod.extract_subtree``.

        Args:
            regex (str): Query used to match the node or tip name. Regular
                expressions can be prefixed with '~'.

        Returns:
            toytree.tree: The extracted subtree.
        """
        subtree = self.tree.mod.extract_subtree(regex)
        self.logger.debug(f"Subtree with any tips labeled {regex} obtained.")
        return subtree

    def prune_tree(self, taxa: List[str] | str) -> tt.ToyTree:
        """Prune the tree by dropping a set of tips.

        A list is unpacked into separate arguments of ``tree.mod.drop_tips``;
        any other value is passed as a single query.

        Args:
            taxa (Union[List[str], str]): List of taxa names, or a single
                query string. Regular expressions can be prefixed with '~'.

        Returns:
            toytree.tree: The pruned tree object.
        """
        tree = self.tree
        if isinstance(taxa, list):
            pruned_tree = tree.mod.drop_tips(*taxa)
        else:
            pruned_tree = tree.mod.drop_tips(taxa)

        self.logger.debug(f"Pruned tree by removing taxa: {taxa}")
        return pruned_tree

    def get_distance_matrix(self) -> pd.DataFrame:
        """Calculate the pairwise node distance matrix for the tree.

        Returns:
            pd.DataFrame: Result of
            ``tree.distance.get_node_distance_matrix(df=True)``.
        """
        dist_df = self.tree.distance.get_node_distance_matrix(df=True)
        self.logger.debug("Computed pairwise distance matrix for the tree.")
        return dist_df

    def reroot_tree(self, node: int | str | List[str]) -> tt.ToyTree:
        """Reroot the tree at the MRCA of the selected node(s).

        Args:
            node (Union[int, str, List[str]]): Index of a node or tip, a query
                string (regex queries are prefixed by "~"), or a list of node
                or tip names. A list is unpacked into ``get_mrca_node``.

        Returns:
            toytree.tree: The rerooted tree returned by ``tree.root``.
        """
        tree = self.tree
        if isinstance(node, list):
            mrca = tree.get_mrca_node(*node)
        else:
            mrca = tree.get_mrca_node(node)

        rerooted_tree = tree.root(mrca)
        self.logger.info(f"Tree rerooted at node {node}.")
        return rerooted_tree

    def load_tree_from_string(self, newick_str: str) -> tt.ToyTree:
        """Load a phylogenetic tree from a Newick string.

        Args:
            newick_str (str): The Newick string representing the tree.

        Returns:
            toytree.tree: The loaded tree object.
        """
        tree = tt.tree(newick_str)
        self.logger.debug("Loaded tree from Newick string.")
        return tree

    def _q_from_file(self) -> pd.DataFrame:
        """Read the Q matrix from a comma- or whitespace-separated file.

        The first non-blank line is treated as a header if it contains the
        letter "A" (case-insensitive). The resulting frame always has columns
        and index set to A, C, G, T.

        Returns:
            pandas.DataFrame: The Q-matrix as a float DataFrame.

        Raises:
            FileNotFoundError: If the Q matrix file is not found.
            ValueError: If the Q matrix file contains no data.
        """
        if not Path(self.qmatrix).is_file():
            raise FileNotFoundError(f"File {self.qmatrix} not found!")

        with open(self.qmatrix, "r") as fin:
            lines = [line for line in fin if line.strip()]

        if not lines:
            msg = f"Q matrix file {self.qmatrix} is empty."
            self.logger.error(msg)
            raise ValueError(msg)

        nucs = ["A", "C", "G", "T"]
        has_header = "A" in lines[0].upper()

        # Sniff the delimiter from the second line when there is one, so a
        # one-line file no longer raises IndexError.
        probe = lines[1] if len(lines) > 1 else lines[0]
        sep = r"," if r"," in probe else r"\s+"

        dfq = pd.read_csv(
            self.qmatrix, sep=sep, header=0 if has_header else None, names=nucs
        )

        # Columns are already A, C, G, T via `names`; force the same labels
        # onto the rows regardless of whether the file had an index column.
        dfq.index = nucs
        dfq = dfq.astype(float)

        self.logger.debug(f"{dfq=}")
        return dfq

    def _q_from_iqtree(self) -> pd.DataFrame:
        """Read the Q matrix from an IQ-TREE (.iqtree) report file.

        The file is scanned for a line containing "rate matrix q"
        (case-insensitive); the next four non-blank data rows are taken as
        the matrix. For example:

        ```
        Rate matrix Q
        A C G T
        A -0.000000 0.000000 0.000000 0.000000
        C 0.000000 -0.000000 0.000000 0.000000
        G 0.000000 0.000000 -0.000000 0.000000
        T 0.000000 0.000000 0.000000 -0.000000
        ```

        The ``A C G T`` header row and the leading row label are both
        optional. Values may be separated by whitespace or commas, and rows
        are assumed to be in the order A, C, G, T.

        Returns:
            pandas.DataFrame: The Q-matrix as a float DataFrame indexed by
            nucleotide.

        Raises:
            FileNotFoundError: If the IQ-TREE file could not be found
                (raised by ``open``).
            IOError: If no rate matrix could be located in the file.
        """
        nucs = ["A", "C", "G", "T"]
        rows: List[List[str]] = []

        with open(self.qmatrix, "r") as fin:
            found_marker = False
            for raw in fin:
                line = raw.strip()
                if not line:
                    continue

                if "rate matrix q" in line.lower():
                    found_marker = True
                    continue

                if not found_marker:
                    continue

                if "," in line:
                    tokens = [tok.strip() for tok in line.split(",")]
                else:
                    tokens = line.split()

                # Skip an optional "A C G T" header directly after the marker.
                if not rows and [tok.upper() for tok in tokens] == nucs:
                    continue

                rows.append(tokens)
                if len(rows) == len(nucs):
                    break

        # Check that the Q matrix was found and read
        if not rows:
            raise IOError(f"Rate matrix Q not found in IQ-TREE file {self.qmatrix}")

        # Supply the row label when the file omitted the index column.
        rows = [
            [nuc, *row] if len(row) == len(nucs) else row
            for nuc, row in zip(nucs, rows)
        ]

        dfq = pd.DataFrame(rows, columns=["nuc", *nucs])
        dfq = dfq.set_index("nuc")
        dfq = dfq.astype(float)

        self.logger.debug(f"{dfq=}")
        return dfq

    def _siterates_from_iqtree(self) -> List[float]:
        """Read site-specific substitution rates from an IQ-TREE rates table.

        The file is parsed as a whitespace-separated table in which lines
        starting with '#' are comments, for example:

        ```
        # Any comment lines can be included here.
        Site Rate Cat C_rate
        1 0.0000 1 0.0000
        2 0.0000 1 0.0000
        ```

        Returns:
            List[float]: Values of the 'Rate' column.

        Raises:
            FileNotFoundError: If the rates file could not be found.
        """
        if not Path(self.siterates).is_file():
            msg = f"File {self.siterates} not found."
            self.logger.error(msg)
            raise FileNotFoundError(msg)

        try:
            dfs = pd.read_csv(self.siterates, sep=r"\s+", comment="#")
        except IOError as e:
            msg = f"Could not read rates file {self.siterates}: {e}"
            self.logger.error(msg)
            raise

        return dfs["Rate"].to_list()

    def _validate_rates(self, rates: List[float]) -> None:
        """Validate that the number of site rates matches the number of SNPs.

        Args:
            rates (List[float]): Site rates; only ``len(rates)`` is used.

        Raises:
            ValueError: If ``len(rates)`` differs from
                ``self.genotype_data.num_snps``.
        """
        # NOTE(review): presumably re-reading the attribute is meant to
        # trigger lazy loading of the SNP data -- confirm against GenotypeData.
        if self.genotype_data.snp_data is None:
            _ = self.genotype_data.snp_data

        if len(rates) != self.genotype_data.num_snps:
            msg = f"Number of site rates != number of snps in the alignment: {len(rates)} != {self.genotype_data.num_snps}"
            self.logger.error(msg)
            raise ValueError(msg)

    def _siterates_from_file(self) -> List[float]:
        """Read site-specific substitution rates from a single-column file.

        The file must contain one rate per line, optionally preceded by a
        single header line. For example:

        ```
        0.0000
        0.0000
        0.0000
        ```

        Returns:
            List[float]: List of site-specific substitution rates.

        Raises:
            ValueError: If the file has no data or more than one column.
        """
        with open(self.siterates, "r") as fin:
            lines = [line.strip() for line in fin]
        lines = [line for line in lines if line]

        if not lines:
            msg = "Site rates file is empty."
            self.logger.error(msg)
            raise ValueError(msg)

        # A comma or a second whitespace-separated token on any line means
        # the file is not a single column.
        if any("," in line or len(line.split()) > 1 for line in lines):
            msg = "Site rates file must have only one column."
            self.logger.error(msg)
            raise ValueError(msg)

        # The first line is a header when it is not a number.
        try:
            float(lines[0])
        except ValueError:
            header = 0
        else:
            header = None

        dfs = pd.read_csv(self.siterates, sep=r"\s+", header=header, names=["Rate"])
        return dfs["Rate"].to_list()

    def _validate_qmat(self, qmat: pd.DataFrame) -> None:
        """Validate the Q matrix.

        Args:
            qmat (pd.DataFrame): Q matrix as a pandas DataFrame.

        Raises:
            TypeError: If the Q matrix is not a pandas DataFrame.
            ValueError: If the Q matrix is empty.
            ValueError: If the Q matrix is not square.
            ValueError: If the Q matrix columns do not equal the index.
            ValueError: If the Q matrix columns are not in the order A, C, G, T.
        """
        if not isinstance(qmat, pd.DataFrame):
            msg = f"Q matrix must be a pandas DataFrame, but got: {type(qmat)}"
            self.logger.error(msg)
            raise TypeError(msg)

        if qmat.empty:
            msg = "Q matrix is empty after attempting to load from file."
            self.logger.error(msg)
            raise ValueError(msg)

        if qmat.shape[0] != qmat.shape[1]:
            msg = f"Q matrix is not square: {qmat.shape}"
            self.logger.error(msg)
            raise ValueError(msg)

        # Compare as plain lists so a length mismatch is reported through
        # our own message rather than a pandas comparison error.
        columns = list(qmat.columns)

        if columns != list(qmat.index):
            msg = (
                f"Q matrix columns must equal the index: {qmat.columns} != {qmat.index}"
            )
            self.logger.error(msg)
            raise ValueError(msg)

        if columns != ["A", "C", "G", "T"]:
            msg = f"Q matrix columns must be in the order A, C, G, T: {qmat.columns}"
            self.logger.error(msg)
            raise ValueError(msg)

    @property
    def qmat(self) -> pd.DataFrame:
        """Get the Q matrix for the corresponding phylogenetic tree.

        On first access the file at ``self.qmatrix`` is loaded: if it has at
        least five non-blank lines and one of them contains "rate matrix q"
        it is parsed as an IQ-TREE report, otherwise as a plain matrix file.
        The result is validated and cached.

        Returns:
            pandas.DataFrame: The Q-matrix as a pandas DataFrame.

        Raises:
            TypeError: If no Q matrix file path was provided.
        """
        if self._qmat is not None:
            self._validate_qmat(self._qmat)
            return self._qmat

        if self.qmatrix is None:
            msg = "Q matrix file path not provided."
            self.logger.error(msg)
            raise TypeError(msg)

        with open(self.qmatrix, "r") as fin:
            lines = [line.strip() for line in fin]
        lines = [line for line in lines if line]

        is_iqtree = len(lines) >= 5 and any(
            "rate matrix q" in line.lower() for line in lines
        )

        qmat = self._q_from_iqtree() if is_iqtree else self._q_from_file()

        # Cache only after validation so an invalid matrix is never stored.
        self._validate_qmat(qmat)
        self._qmat = qmat
        return self._qmat

    @qmat.setter
    def qmat(self, value: pd.DataFrame) -> None:
        """Set the Q matrix after validating it.

        Args:
            value (pd.DataFrame): Square Q-matrix with columns and index in
                the order A, C, G, T.
        """
        self._validate_qmat(value)
        self._qmat = value

    @property
    def site_rates(self) -> List[float]:
        """Get site rate data for the phylogenetic tree.

        On first access the file at ``self.siterates`` is loaded. If any
        non-comment line contains "c_rate" (case-insensitive) it is parsed as
        an IQ-TREE rates table; otherwise as a single-column file. Example
        inputs:

        ```
        0.0000
        0.0000
        ```

        OR:

        ```
        # Any comment lines can be included here.
        Site Rate Cat C_rate
        1 0.0000 1 0.0000
        2 0.0000 1 0.0000
        ```

        Returns:
            List[float]: Site rates for the phylogenetic tree.

        Raises:
            TypeError: If no site rates file path was provided.
        """
        if self._site_rates is not None:
            self._validate_rates(self._site_rates)
            return self._site_rates

        if self.siterates is None:
            msg = "Site rates file path not provided."
            self.logger.error(msg)
            raise TypeError(msg)

        with open(self.siterates, "r") as fin:
            lines = [line.strip() for line in fin]
        lines = [line for line in lines if line and not line.startswith("#")]

        # "c_rate" already implies "rate", so a single test is sufficient.
        is_iqtree = any("c_rate" in line.lower() for line in lines)

        if is_iqtree:
            rates = self._siterates_from_iqtree()
        else:
            rates = self._siterates_from_file()

        # Filter site rates to only include rates for remaining loci after
        # alignment filtering.
        if self._loci_indices is None:
            # NOTE(review): assumes len(self.snp_data) equals the number of
            # loci -- confirm the orientation of snp_data in GenotypeData.
            self.loci_indices = np.ones(len(self.snp_data), dtype=bool)

        if np.count_nonzero(self.loci_indices) < len(rates):
            rates = np.array(rates)[self.loci_indices].tolist()

        # Cache only after validation so invalid rates are never stored.
        self._validate_rates(rates)
        self._site_rates = rates
        return self._site_rates

    @site_rates.setter
    def site_rates(self, value: List[float]) -> None:
        """Set the site rates after validating their length.

        Args:
            value (List[float]): The site rates as a list of floats.
        """
        self._validate_rates(value)
        self._site_rates = value

    @property
    def tree(self):
        """Get the tree, reading it from ``self.treefile`` on first access.

        Returns:
            toytree.tree: The toytree tree object.

        Raises:
            TypeError: If no tree file path was provided.
        """
        if self._tree is not None:
            return self._tree

        if self.treefile is None:
            msg = "Tree file path not provided."
            self.logger.error(msg)
            raise TypeError(msg)

        self._tree = self.read_tree()
        return self._tree

    @tree.setter
    def tree(self, value: tt.ToyTree) -> None:
        """Set the cached tree object.

        Args:
            value (toytree.tree): The tree object to set.
        """
        self._tree = value