Example script

The run_snpio.py script provides a template you can use to get started.

Just type:

python3 run_snpio.py

and it will run the example data.

Below is the code for the script:

import pandas as pd

from snpio import GenotypeEncoder, NRemover2, Plotting, TreeParser, VCFReader

# from snpio.utils.benchmarking import Benchmark


def main():
   # Read the alignment, popmap, and tree files.
   gd = VCFReader(
      filename="snpio/example_data/vcf_files/phylogen_subset14K_sorted.vcf.gz",
      popmapfile="snpio/example_data/popmaps/phylogen_nomx.popmap",
      force_popmap=True,  # Remove samples not in the popmap, or vice versa.
      chunk_size=5000,  # Number of lines to read into memory at a time.
   )

   # Run PCA and make missingness report plots.
   plotting = Plotting(genotype_data=gd)
   gd_components, gd_pca = plotting.run_pca()
   gd.missingness_reports()

   nrm = NRemover2(gd)

   # nrm.search_thresholds(
   #     thresholds=[0.25, 0.5, 0.75, 1.0],
   #     maf_thresholds=[0.0, 0.01, 0.025, 0.05],
   #     mac_thresholds=[2, 5],
   # )

   # # Plot benchmarking results.
   # NOTE: For development purposes. Comment out for normal use.
   # Benchmark.plot_performance(nrm.genotype_data, nrm.genotype_data.resource_data)

   gd_filt = (
      nrm.filter_missing_sample(0.75)
      .filter_missing(0.75)
      .filter_missing_pop(0.75)
      .filter_mac(2)
      .filter_monomorphic(exclude_heterozygous=False)
      .filter_singletons(exclude_heterozygous=False)
      .filter_biallelic(exclude_heterozygous=False)
      .resolve()
   )

   nrm.plot_sankey_filtering_report()

   # Make missingness report plots.
   plotting2 = Plotting(genotype_data=gd_filt)
   filt_components, filt_pca = plotting2.run_pca()
   gd_filt.missingness_reports(prefix="filtered")

   # Write the filtered VCF file.
   gd_filt.write_vcf("snpio/example_data/vcf_files/nremover_test.vcf")

   # Encode the genotypes into 012, one-hot, and integer formats.
   ge = GenotypeEncoder(gd_filt)
   gt_012 = ge.genotypes_012
   gt_onehot = ge.genotypes_onehot
   gt_int = ge.genotypes_int

   df012 = pd.DataFrame(gt_012)
   dfint = pd.DataFrame(gt_int)

   print(df012.head())
   print(gt_onehot[:5])
   print(dfint.head())

   tp = TreeParser(
      genotype_data=gd_filt,
      treefile="snpio/example_data/trees/test.tre",
      qmatrix="snpio/example_data/trees/test.iqtree",
      siterates="snpio/example_data/trees/test14K.rate",
      verbose=True,
      debug=False,
   )

   # Get a toytree object by reading the tree file.
   tree = tp.read_tree()

   # Get the tree stats. Returns a dictionary of tree stats.
   print(tp.tree_stats())

   # Reroot the tree at any nodes containing the string 'EA' in the sampleID.
   tp.reroot_tree("~EA")

   # Get a distance matrix between all nodes in the tree.
   print(tp.get_distance_matrix())

   # Get the Rate Matrix Q from the Qmatrix file.
   print(tp.qmat)

   # Get the Site Rates from the Site Rates file.
   print(tp.site_rates)

   # Get a subtree with only the samples containing 'EA' in the sampleID.
   subtree = tp.get_subtree("~EA")

   # Prune the tree to remove samples containing 'ON' in the sampleID.
   pruned_tree = tp.prune_tree("~ON")

   # Write the subtree and pruned tree. Returns a Newick string if 'save_path'
   # is None.
   print(tp.write_tree(subtree, save_path=None))
   print(tp.write_tree(pruned_tree, save_path=None))


if __name__ == "__main__":
   main()