Source code for pyllelic.process

#!/usr/bin/env python3
"""Utilities to pre-process and prepare data for use in pyllelic."""

import gzip
import os
import re
import shutil
import subprocess
from pathlib import Path
from typing import Dict, List, Optional

import dask.dataframe as dd
import pandas as pd
import pysam
import requests
from Bio import SeqIO
from tqdm.auto import tqdm

from pyllelic.config import Config
from pyllelic.pyllelic import GenomicPositionData


class ShellCommandError(Exception):
    """Raised when a required external shell utility is not installed."""
class FileNameError(Exception):
    """Raised when a file does not have a supported file type."""
def fastq_to_list(filepath: Path) -> List[SeqIO.SeqRecord]:
    """Load every record of a .fastq or .fastq.gz file into memory.

    The whole file is materialised as a list, so this is a time and memory
    intensive operation.

    Args:
        filepath (Path): file path to a .fastq or .fastq.gz file

    Returns:
        List[SeqRecord]: biopython sequence records read from the file

    Raises:
        FileNameError: "Wrong filetype" when no ".fastq" suffix is present, or
            when the final suffix is neither gzip nor fastq.
    """
    extensions = filepath.suffixes
    if ".fastq" not in extensions:
        raise FileNameError("Wrong filetype")

    final_extension = extensions[-1]

    # Compressed input is decompressed on the fly and parsed as text.
    if ".gz" in final_extension:
        with gzip.open(filepath, "rt") as compressed:
            return list(SeqIO.parse(compressed, "fastq"))

    if ".fastq" in final_extension:
        with open(filepath) as plain:
            return list(SeqIO.parse(plain, "fastq"))

    # ".fastq" appears among the suffixes but is followed by an unreadable one.
    raise FileNameError("Wrong filetype")
def make_records_to_dictionary(
    record_list: List[SeqIO.SeqRecord],
) -> Dict[str, SeqIO.SeqRecord]:
    """Index a list of biopython SeqRecords by their ``id`` attribute.

    When several records share an id, the one appearing last in
    ``record_list`` is the one kept.

    Args:
        record_list (List[SeqRecord]): biopython sequence records from a fastq file

    Returns:
        Dict[str, SeqRecord]: mapping of record id to its SeqRecord
    """
    return {entry.id: entry for entry in record_list}
def build_bowtie2_index(fasta: Path) -> str:
    """Build a bowtie2 index from a fasta file with the external bowtie2-build tool.

    The index files are written using the base name "index", resolved against
    the current working directory of the calling process.

    Args:
        fasta (Path): filepath to fasta file to build index from

    Returns:
        str: output from bowtie2-build shell command, usually discarded

    Raises:
        ShellCommandError: bowtie2-build is not installed.
        subprocess.CalledProcessError: bowtie2-build exited with a non-zero status.
    """
    # bowtie2-build's positional arguments are <reference_in> <bt2_base>:
    # the fasta to read comes first, the base name of the index files second.
    command: List[str] = ["bowtie2-build", os.fspath(fasta), "index"]

    if shutil.which(command[0]) is None:
        raise ShellCommandError("bowtie2-build is not installed.")

    output: subprocess.CompletedProcess[str] = subprocess.run(
        command, capture_output=True, text=True, check=True
    )
    return output.stdout
def bowtie2_fastq_to_bam(index: Path, fastq: Path, cores: int) -> str:
    """Align a fastq file with bowtie2 and convert the alignments to a bam file.

    Runs the equivalent of the shell pipeline
    ``bowtie2 -p CORES -x INDEX -U FASTQ | samtools view -bS - > OUT.bam``
    where OUT.bam sits next to ``fastq`` and is named after ``fastq.stem``.
    The two programs are connected with a real pipe; no shell is involved,
    so paths containing spaces or shell metacharacters are passed verbatim.

    Args:
        index (Path): filepath to bowtie index file
        fastq (Path): filepath to fastq file to convert to bam
        cores (int): number of cores to use for processing

    Returns:
        str: diagnostic (stderr) text emitted by bowtie2 followed by that of
            samtools, usually discarded. The alignment data itself goes to
            the bam file, not to the return value.

    Raises:
        ShellCommandError: bowtie2 or samtools is not installed.
        subprocess.CalledProcessError: either program exited with a non-zero status.
    """
    # Local import: only this function needs a scratch file.
    import tempfile

    align_command: List[str] = [
        "bowtie2",
        "-p",
        str(cores),
        "-x",
        str(index),
        "-U",
        str(fastq),
    ]
    convert_command: List[str] = ["samtools", "view", "-bS", "-"]
    bam_path = fastq.parent / f"{fastq.stem}.bam"

    if shutil.which(align_command[0]) is None:
        raise ShellCommandError("bowtie2 is not installed.")
    if shutil.which(convert_command[0]) is None:
        raise ShellCommandError("samtools is not installed.")

    # bowtie2's stderr goes to a scratch file rather than a pipe: nobody reads
    # it while samtools is running, and a full pipe buffer would stall bowtie2.
    with open(bam_path, "wb") as bam_handle, tempfile.TemporaryFile("w+") as align_log:
        with subprocess.Popen(
            align_command, stdout=subprocess.PIPE, stderr=align_log
        ) as aligner:
            converter = subprocess.run(
                convert_command,
                stdin=aligner.stdout,
                stdout=bam_handle,
                stderr=subprocess.PIPE,
                text=True,
                check=False,
            )
        # Leaving the Popen context closed our end of the pipe and waited for
        # bowtie2, so its return code and log are final at this point.
        align_log.seek(0)
        align_messages = align_log.read()

    if aligner.returncode != 0:
        raise subprocess.CalledProcessError(
            aligner.returncode, align_command, stderr=align_messages
        )
    if converter.returncode != 0:
        raise subprocess.CalledProcessError(
            converter.returncode, convert_command, stderr=converter.stderr
        )

    return align_messages + converter.stderr
def sort_bam(bamfile: Path) -> bool:
    """Sort a bam file through pysam's samtools ``sort`` wrapper.

    The sorted output is written alongside the input, named after the input's
    stem with a ``_sorted.bam`` ending.

    Args:
        bamfile (Path): filepath to bam file

    Returns:
        bool: always True once the sort call has returned, usually discarded
    """
    sorted_target = f"{bamfile.parent}/{bamfile.stem}_sorted.bam"
    unsorted_source = os.fspath(bamfile)
    pysam.sort("-o", sorted_target, unsorted_source)  # type:ignore[attr-defined]
    return True
def index_bam(bamfile: Path) -> bool:
    """Index a bam file through pysam's samtools ``index`` wrapper.

    Args:
        bamfile (Path): filepath to bam file

    Returns:
        bool: always True once the index call has returned, usually discarded
    """
    bam_location = os.fspath(bamfile)
    pysam.index(bam_location)  # type:ignore[attr-defined]
    return True
def retrieve_seq(
    filename: str, chrom: str, start: int, end: int, timeout: float = 60.0
) -> None:
    """Retrieve the genomic sequence of interest from UCSC Genome Browser.

    Queries the hg19 DAS endpoint, extracts the contents of the ``<DNA>``
    element, strips newlines and writes the result to ``filename``.

    Args:
        filename (str): path to store genomic sequence
        chrom (str): chromosome of interest, e.g. "chr5"
        start (int): start position for region of interest
        end (int): end position for region of interest
        timeout (float): seconds to wait for the server before giving up,
            defaults to 60.0

    Raises:
        IndexError: the response did not contain a ``<DNA>`` element.
        requests.exceptions.Timeout: the server did not answer within ``timeout``.
    """
    # Without an explicit timeout, requests waits on an unresponsive server forever.
    response = requests.get(
        f"https://genome.ucsc.edu/cgi-bin/das/hg19/dna?segment={chrom}:{start},{end}",
        timeout=timeout,
    )

    pattern = re.compile(r"<DNA.*>(.*)<\/DNA>", flags=re.DOTALL)
    found = pattern.search(response.text)
    if found is None:
        # Same exception type an empty findall()[0] would give, but with
        # enough context to diagnose a bad region or a server-side error.
        raise IndexError(
            f"No <DNA> element in UCSC response for {chrom}:{start},{end} "
            f"(HTTP status {response.status_code})"
        )

    seq = found.group(1).replace("\n", "")
    Path(filename).write_text(seq)
def prepare_genome(index: Path, aligner: Optional[Path] = None) -> str:
    """Run the external bismark genome preparation tool on a genome.

    The tool is executed with ``index.parent`` as its working directory.

    Uses genomes from, e.g.: http://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/
    Bismark documentation at: https://github.com/FelixKrueger/Bismark/tree/master/Docs

    Args:
        index (Path): filepath to unprocessed genome file.
        aligner (Optional[Path]): filepath to bowtie2 alignment program.

    Returns:
        str: output from genome preparation shell command, usually discarded

    Raises:
        ShellCommandError: bismark_genome_preparation is not installed.
    """
    tool = "bismark_genome_preparation"

    # The aligner location is only forwarded when the caller supplied one.
    aligner_args: List[str] = ["--path_to_aligner", str(aligner)] if aligner else []
    command: List[str] = [tool, *aligner_args, str(index)]

    if shutil.which(tool) is None:
        raise ShellCommandError("bismark_genome_preparation is not installed.")

    completed: subprocess.CompletedProcess[str] = subprocess.run(
        command,
        capture_output=True,
        text=True,
        check=True,
        cwd=index.parent,
    )
    return completed.stdout
def bismark(genome: Path, fastq: Path) -> str:
    """Run the external bismark tool on a fastq file.

    The tool is executed with ``fastq.parent`` as its working directory.

    Bismark documentation at: https://github.com/FelixKrueger/Bismark/tree/master/Docs

    Args:
        genome (Path): filepath to directory of bismark processed genome files.
        fastq (Path): filepath to fastq file to process.

    Returns:
        str: output from bismark shell command, usually discarded

    Raises:
        ShellCommandError: bismark is not installed.
    """
    tool = "bismark"
    command: List[str] = [tool, "--genome", str(genome), str(fastq)]

    if shutil.which(tool) is None:
        raise ShellCommandError("bismark is not installed.")

    completed: subprocess.CompletedProcess[str] = subprocess.run(
        command,
        capture_output=True,
        text=True,
        check=True,
        cwd=fastq.parent,
    )
    return completed.stdout
def convert_methbank_bed(
    path: Path, chrom: str, start: int, stop: int, viz: str = "plotly"
) -> GenomicPositionData:
    """Build a GenomicPositionData object from a MethBank BED file.

    Rows are kept when their "#chr" equals ``chrom`` and their "start" lies
    strictly between ``start`` and ``stop``. The columns "total_num",
    "methy_num" and "percent_num" are then used to derive per-position
    unmethylated counts, mean, mode and their absolute difference.

    The returned object is created without running ``__init__``; its
    attributes are filled in directly from the BED data.

    MethBank: https://ngdc.cncb.ac.cn/methbank/

    Args:
        path (Path): path to MethBank formatted BED file
        chrom (str): chromosome identifier
        start (int): genomic start position
        stop (int): genomic stop position
        viz (str): Plotting backend to use, defaults to plotly

    Returns:
        GenomicPositionData: mostly complete pyllelic object with data from BED.
    """

    def _row_by_position(column: str) -> pd.DataFrame:
        """Return ``column`` as a one-row frame whose columns are str start positions."""
        wide = bed[["start", column]].T.rename(columns=bed["start"]).drop("start")
        wide.columns = wide.columns.astype(str)
        return wide

    # FIXME: progress bar doesn't work with Dask
    with tqdm(total=8, desc="Processing .BED file") as pbar:
        lazy_bed = dd.read_csv(path, sep="\t")  # type:ignore[attr-defined]
        pbar.update(1)

        lazy_bed = lazy_bed[
            (lazy_bed["#chr"] == chrom)
            & (lazy_bed["start"] > start)
            & (lazy_bed["start"] < stop)
        ]
        pbar.update(1)

        # Materialise the filtered dask frame as a pandas dataframe.
        bed = lazy_bed.compute()
        pbar.update(1)

        bed["un_num"] = bed["total_num"] - bed["methy_num"]
        bed["mean"] = bed["percent_num"]
        pbar.update(1)

        # Mode is 1 where methylated reads are at least as many as unmethylated.
        bed["mode"] = 0
        bed.loc[bed["methy_num"] >= bed["un_num"], "mode"] = 1.0  # pandas
        bed["diff"] = abs(bed["mode"] - bed["mean"])
        pbar.update(1)

        # Expand the counts into one list of 1.0/0.0 calls per start position.
        calls_by_start = {}
        for _, row in bed.iterrows():
            calls_by_start[row.start] = [1.0] * row["methy_num"] + [0.0] * row["un_num"]
        ind_df = pd.DataFrame(
            {str(position): [calls] for position, calls in calls_by_start.items()}
        )
        pbar.update(1)

        # Create the instance without running __init__, then populate it by hand.
        result = GenomicPositionData.__new__(GenomicPositionData)
        result.files_set = [path.name]
        result.file_names = [path.stem]
        result.config = Config(
            promoter_start=start,
            promoter_end=stop,
            chromosome=chrom,
            offset=start,
            viz_backend=viz,
        )
        result.individual_data = ind_df
        pbar.update(1)

        result.allelic_data = result._generate_chisquared_test_df()
        result.positions = result.individual_data.columns.tolist()
        result.means = _row_by_position("mean")
        result.modes = _row_by_position("mode")
        result.diffs = _row_by_position("diff")
        pbar.update(1)

    return result