Source code for cocoatree.datasets._base

import os
from ..io import load_MSA, load_pdb
import numpy as np
import pandas as pd
import gzip


[docs] def load_S1A_serine_proteases(paper='rivoire'): """ Load the S1A serine protease dataset Halabi dataset: 1470 sequences of length 832; 3 sectors identified Rivoire dataset : 1390 sequences of length 832 (snake sequences were removed for the paper's analysis); 6 sectors identified (including the 3 from Halabi et al, 2008) Parameters ---------- paper: str, either 'halabi' or 'rivoire' whether to load the dataset from Halabi et al, Cell, 2008 or from Rivoire et al, PLoS Comput Biol, 2016 Returns ------- a dictionnary containing : - `sequences_ids`: a list of strings corresponding to sequence names - `alignment`: a list of strings corresponding to sequences. Because it is an MSA, all the strings are of same length. - `metadata`: a pandas dataframe containing the metadata associated with the alignment. - `sector_positions`: a dictionnary of arrays containing the residue positions associated to each sector, either in Halabi et al, or in Rivoire et al. - `pdb_sequence`: sequence extracted from rat's trypsin PDB structure - `pdb_positions`: positions extracted from rat's trypsin PDB structure """ module_path = os.path.dirname(__file__) if paper == 'halabi': # Load the alignment used in Halabi et al, 2008 filename = os.path.join( module_path, "data/S1A_serine_proteases/halabi_alignment.fasta") data = load_MSA(filename, format="fasta") # Load the positions of the 3 sectors identified in Halabi et al, Cell, # 2008 filename = os.path.join( module_path, "data/S1A_serine_proteases/halabi_sectors.npz") sectors = np.load(filename) # Load the metadata filename = os.path.join( module_path, "data/S1A_serine_proteases/halabi_metadata.csv") metadata = pd.read_csv(filename) elif paper == 'rivoire': # Load the alignment used in Rivoire et al, 2016 filename = os.path.join( module_path, "data/S1A_serine_proteases/rivoire_alignment.fasta") data = load_MSA(filename, format="fasta") # Load the positions of the 6 sectors identified in Rivoire et al, PLoS # Comput Biol, 2016 filename = os.path.join( module_path, "data/S1A_serine_proteases/rivoire_sectors.npz") sectors = np.load(filename) # Load the metadata filename = os.path.join( module_path, "data/S1A_serine_proteases/rivoire_metadata.csv") metadata = pd.read_csv(filename) else: raise ValueError(f"invalid paper: {paper}. Options are 'halabi' or \ 'rivoire'") # Load the PDB structure filename = os.path.join( module_path, "data/S1A_serine_proteases/3tgi.pdb") pdb_sequence, pdb_positions = load_pdb(filename, '3TGI', 'E') data["sector_positions"] = sectors data["metadata"] = metadata data["pdb_sequence"] = pdb_sequence, data["pdb_positions"] = pdb_positions return data
[docs] def load_rhomboid_proteases(): """ Load the rhomboid protease dataset This dataset comes from Mihaljevic & Urban, Cell, 2020 (DOI: https://doi.org/10.1016/j.str.2020.07.015). Returns ------- a dictionnary containing : - `sequence_ids`: a list of strings corresponding to sequence names - `alignment`: a list of strings corresponding to sequences. Because it is an MSA, all the strings are of same length. - `sector_positions`: a dictionnary of arrays containing the residue positions associated to each sector as published in the original paper. - `pdb_sequence`: sequence extracted from E. coli's PDB structure - `pdb_positions`: positions extracted from E. coli's PDB structure """ module_path = os.path.dirname(__file__) filename = os.path.join( module_path, "data/rhomboid_proteases/Data_S1_Rhomboid_MSA_short_names.fasta") data = load_MSA(filename, format="fasta") filename = os.path.join( module_path, "data/rhomboid_proteases/rhomboid_sectors.npz") sectors = np.load(filename) # Load the metadata filename = os.path.join( module_path, "data/rhomboid_proteases/rhomboid_metadata_clean.csv") metadata = pd.read_csv(filename) # Load the PDB structure filename = os.path.join( module_path, "data/rhomboid_proteases/2NRF.pdb") # Two chains: A or B pdb_sequence, pdb_positions = load_pdb(filename, '2NRF', 'A') data["sector_positions"] = sectors data["metadata"] = metadata data["pdb_sequence"] = pdb_sequence, data["pdb_positions"] = pdb_positions return data
def load_DHFR(): """ load the DHFR dataset This dataset comes from Kalmer et al, The Journal of Physical Chemistry B, 2024 (https://pubs.acs.org/doi/10.1021/acs.jpcb.4c04195) Returns ------- a dictionnary containing : - `sequence_ids`: a list of strings corresponding to sequence names - `alignment`: a list of strings corresponding to sequences. Because it is an MSA, all the strings are of same length. - `sector_positions`: a dictionnary of arrays containing the residue positions associated to each sector as published in the original paper. - `pdb_sequence`: sequence extracted from E. coli's PDB structure - `pdb_positions`: positions extracted from E. coli's PDB structure """ module_path = os.path.dirname(__file__) filename = os.path.join( module_path, "data/DHFR/alignment.faa.gz") with gzip.open(filename, "rt") as f: data = load_MSA(f, format="fasta") filename = os.path.join( module_path, "data/DHFR/DHFR_sectors.npz") sectors = np.load(filename) # Load the PDB structure filename = os.path.join( module_path, "data/DHFR/3QL0.pdb") pdb_sequence, pdb_positions = load_pdb(filename, '3QL0', 'A') data["sector_positions"] = sectors data["pdb_sequence"] = pdb_sequence, data["pdb_positions"] = pdb_positions return data