Source code for cocoatree.io

import warnings

from Bio import AlignIO
from Bio.PDB import PDBParser
from .msa import _clean_msa
from .__params import aatable
from ete3 import Tree
import numpy as np


[docs] def load_MSA(file_path, format="fasta", clean=True, verbose=False): """Read in a multiple sequence alignment (MSA) Parameters ---------- file_path : path to the alignment file format : string {"fasta", "phylip", …}, optional, default: "fasta" format of the alignment file (e.g. 'fasta', 'phylip', etc.) All format supported by biopython's Bio.AlignIO.read are accepted. clean : boolean, default=True whether to remove ambiguous amino acids (e.g. B, X etc.) verbose : boolean, whether to print informations about the MSA Returns ------- a dictionnary containing: - `sequences_id`, list of sequence identifiers - `alignment`: list of sequences as strings """ alignment = AlignIO.read(file_path, format) if clean: alignment = _clean_msa(alignment) sequences_id = [record.id for record in alignment] sequences = [str(record.seq) for record in alignment] if verbose: print('Number of sequences: %i' % len(alignment)) print('Alignment of length: %i' % len(alignment[0])) return {"sequence_ids": sequences_id, "alignment": sequences}
[docs] def load_tree_ete3(file_path): """ From the loading of a Newick tree, generate a ete3.Tree object Parameters ---------- file_path : path to the Newick file Returns ------- tree_ete3 : `ete3.Tree` object """ tree_ete3 = Tree(file_path, format=0) return tree_ete3
[docs] def export_fasta(sequences, sequences_id, outpath): """ Export intermediate files in FASTA format Parameters ---------- sequences : list of sequences as strings (as imported by load_MSA) sequences_id : list of sequences identifiers (as imported by load_MSA) outpath : path to the output file """ # Add checks to see if the path exists? Nseq = len(sequences) with open(outpath, 'w') as outfile: for record in range(0, Nseq): outfile.write('>' + str(sequences_id[record]) + '\n') outfile.write(str(sequences[record]) + '\n')
[docs] def load_pdb(path2pdb, pdb_id, chain): """ Read in a PDB file. Import a PDB file and extract the associated sequence along with the amino acid positions Parameters ---------- path2pdb : path to the PDB file pdb_id : str, identifier of the PDB file chain : str, name of the chain to read Returns ------- pbd_seq : str, amino acid sequence of the PDB file pdb_pos : list, PDB position of each amino acid """ with warnings.catch_warnings(): warnings.simplefilter("ignore") P = PDBParser(PERMISSIVE=1) structure = P.get_structure(pdb_id, path2pdb) # Fill up sequence and label information pdb_seq = "" pdb_pos = list() residues = [res for res in structure[0][chain] if res.get_id()[0] == " "] for res in residues: pdb_pos.append(str(res.get_id()[1]) + str(res.get_id()[2]).strip()) try: pdb_seq += aatable[res.get_resname()] except BaseException as e: print("Error: " + str(e)) pdb_seq += "X" return pdb_seq, pdb_pos
[docs] def export_sector_for_pymol(mapping, independent_components, axis, sector_pos_in_loaded_msa, sector_pos_in_filtered_msa, outpath): """ Export sector information for mapping on 3D structure in PyMOL. Export numpy arrays of a sector's residue positions and their contribution for coloring in PyMOL. Parameters ---------- mapping : numpy.ndarray, mapping between the unfiltered MSA and the PDB structure, output of cocoatree.msa.map_to_pdb() function independent_components : numpy.ndarray, output of cocoatree.deconvolution.compute_ica() function axis : int, rank of the independent component associated with the desired sector sector_pos_in_loaded_msa : list, positions of the sector's residues in the unfiltered MSA sector_pos_in_filtered_msa : numpy.ndarray, positions of the sector's residues in the filtered MSA, output from cocoatree.deconvolution.icList() function outpath : str, path to the output file as a binary in .npy format Returns ------- binary file in .npy format containing an array with the positions of the sector's residues and an array with their contribution to the independent component. """ sector_pdb_pos = [] for residue in sector_pos_in_loaded_msa: index = np.where(mapping[2] == str(residue))[0][0] sector_pdb_pos.append(mapping[1][index]) ic_contributions = [] for residue in sector_pos_in_filtered_msa: ic_contributions.append(independent_components[residue, axis]) np.save(outpath, np.array([sector_pdb_pos, ic_contributions]))