'''
Before you run this snakemake workflow, please ensure that:
0. All in all: you must install snakemake at the base environment in order to run this workflow.
1. FragGeneScanRs has been in your path, with intact functionality.
2. Binny snakemake workflow works fine, and the initialization of conda environment and database passed the check pass. 
3. Set up the CAT database.
4. figure out what is gonna be your file's name, put a list into it! [either from snakemake itself or by bash script controlling.]
5. Add CAT's conda environment bin to your path, with intact functionality.
6. Envrionment name "Kronatools" or a krona.yml in the envs directory.
'''

import os
import sys
import yaml
import urllib.request
from snakemake.utils import min_version
import logging

min_version("6.0")

shell.executable("bash")

# default configuration file.
configfile: "config/config.running.yaml"

# Some major configuration parameters and inputs.
assembly_path = config['Inputs']['assembly_path']
ALIGNMENT_FILES = config['Inputs']['alignment_files']

if config['Inputs']['depth_txt']:
    DEPTH_TEXT_FILE=config['Inputs']['depth_txt']
else:    
    DEPTH_TEXT_FILE = ""
OUTPUT_DIR = Path(config['outputdir'])
BATCH_NAME = config['Inputs']['batch_name']

BINNY_OUTPUT_DIR = config['binny_settings']['outputdir_binny']    
# For binny: some global names.
GLOBAL_CONFIG = "config.ChloroScan.yaml"
THREAD=config['threads']

WORKFLOW_DIR = Path(workflow.basedir)
# BINNY_DIR = WORKFLOW_DIR.parent.parent/"binny_Chloroscan" 

SCR_DIR = WORKFLOW_DIR/"scripts"

DEFAULT_DB_DIR = Path("/home/student.unimelb.edu.au/yuhtong/andy/MMA_organelle_metagenomics/databases")
DB_DIR = WORKFLOW_DIR.parent.parent/"databases" 
# For CAT taxonomy, some global name.
CAT_DB_DIR = DB_DIR/config['CAT_database']
CAT_TAXON_DIR = DB_DIR/config['CAT_taxonomy']
TAXA = CAT_TAXON_DIR/"names.dmp"

GLOBAL_PREFIX="OUT"
GLOBAL_PREFIX_contigs="out.CAT"
# create the output directory.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# create temporary directory.
TMP_DIR = config['tmpdir']
if not os.path.isabs(TMP_DIR):
    TMP_DIR = OUTPUT_DIR/TMP_DIR

TMP_DIR.mkdir(parents=True, exist_ok=True) # what if we do want to prepare data and put it here?

# Krona environment name.
KRONA_ENV=config['Krona_env']
# Now rules are introduced. They are placed into the single snakefile in order to increase efficiency for debugging.
# Major output: MMA.done, branching summary results: cross_ref.xlsx and CAT_taxonomy_annotation.tsv. 
rule all:
    input:
        OUTPUT_DIR/"working/binny",
        OUTPUT_DIR/"working/summary/cross_ref.tsv",
        OUTPUT_DIR/"working/visualizations",
        OUTPUT_DIR/"Krona.html",
        OUTPUT_DIR/"ChloroScan.done"


# The most important two rules are: corgi and binny.
# SCRDIR, needs an absolute path?
rule corgi_prediction:
    input: 
        seqs = assembly_path 
    params:
        batch_size = config['corgi_settings']['batch_size'],
        min_length = config['corgi_settings']['minlen'],
        p_threshold = config['corgi_settings']['pthreshold'],
        bash_plastid_accession_script = Path(SCR_DIR/"corgi-gather_accession.sh"),
        python_fasta_writer = Path(SCR_DIR/"corgi-fasta_filter.py"),
        plastid_contigs_identifier = Path(TMP_DIR/"plastid_accession.txt"),
        output_dir=OUTPUT_DIR
    conda:
        "envs/corgi_env.yml"
    output:
        corgi_out = OUTPUT_DIR/"working/corgi/corgi-prediction.csv",
        plastid_contigs = OUTPUT_DIR/"working/corgi/plastid.fasta"
        # here may change the output to a directory for file writing!
    message:
        "Using the machine learning algorithm 'CORGI' to predict contig identity. Don't rise batch size too high."
    resources:
        mem_mb=20000
    threads:
        15
    log:
        OUTPUT_DIR/"logging_info/corgi.log"
    benchmark:
        OUTPUT_DIR/"logging_info/corgi_benchmarking.tsv"
    shell:
        """
        pwd
        echo CORGI output going to here: {output.corgi_out}
        touch {output.plastid_contigs}
        echo "corgi --file {input.seqs} --csv {output.corgi_out} --batch-size {params.batch_size} --min-length {params.min_length} --no-save-filtered"
        corgi --file {input.seqs} --csv {output.corgi_out} --batch-size {params.batch_size} --min-length {params.min_length} --no-save-filtered &> {log}
        ls -lh {output.corgi_out}
        
        header=1
        raw_plastid_contig_count=$(grep -o "plastid" {output.corgi_out} | wc -l)
        plastid_contig_count=$((raw_plastid_contig_count-header))

        zero=0
        if [ $plastid_contig_count -eq $zero ]; then
            # do something to store the csv file only, and stop any other things.
            echo "No plastid contigs found, rule exit."
            exit 0

        elif [ $plastid_contig_count -le 100 ]; then
            echo "bash {params.bash_plastid_accession_script} -c {output.corgi_out} -o {params.plastid_contigs_identifier}"
            bash {params.bash_plastid_accession_script} -c {output.corgi_out} -o {params.plastid_contigs_identifier}
            echo "python {params.python_fasta_writer} --assembly {input.seqs} --accession {params.plastid_contigs_identifier} --fasta_file_name {output.plastid_contigs}"
            python {params.python_fasta_writer} --assembly {input.seqs} --accession {params.plastid_contigs_identifier} --fasta_file_name {output.plastid_contigs}

            # Do some calculations here.
            prediction_lines=$(wc -l <{output.corgi_out})
            contig_total=$((prediction_lines-header))
            ls {params.output_dir}/working/corgi
            
            echo "Is it here?"
            plastid_match=$(grep -o ">" {output.plastid_contigs} | wc -l)

            echo "total classified contigs containing all 5 domains: $contig_total"> {params.output_dir}/corgi.summary.txt
            echo "total number of plastid contigs: $plastid_match">>{params.output_dir}/corgi.summary.txt
        else
            echo "bash {params.bash_plastid_accession_script} -c {output.corgi_out} -o {params.plastid_contigs_identifier} -p {params.p_threshold}"
            bash {params.bash_plastid_accession_script} -c {output.corgi_out} -o {params.plastid_contigs_identifier} -p {params.p_threshold}
            python {params.python_fasta_writer} --assembly {input.seqs} --accession {params.plastid_contigs_identifier} --fasta_file_name {output.plastid_contigs}
            # Do some calculations here.
            prediction_lines=$(wc -l <{output.corgi_out})
            contig_total=$((prediction_lines-header))
            
            plastid_match=$(grep -o ">" {output.plastid_contigs} | wc -l)

            echo "total classified contigs containing all 5 domains: $contig_total"> {params.output_dir}/corgi.summary.txt
            echo "total number of plastid contigs: $plastid_match">>{params.output_dir}/corgi.summary.txt
        fi
        
        """    

# Add one checkpoint here, I want to end the workflow if there are no plastid contigs.

rule binny_workflow:
    input:
        plastid_contigs = rules.corgi_prediction.output.plastid_contigs
        
    params:
        # binny_dir = BINNY_DIR,
        bamfiles = ALIGNMENT_FILES,
        depth_text = DEPTH_TEXT_FILE,
        # defaultxxx_config = BINNY_DIR/"config/config.default.yaml",
        # currxxx_config = BINNY_DIR/"config/config.ChloroScan.yaml",
        bash_create_yaml = Path(SCR_DIR/"create_yaml.sh"),
        # outputdir is a must-checked one!
        universal_length_cutoff = config['binny_settings']['universal_length_cutoff'],
        hdbscan_epsilon_range = config['binny_settings']['clustering']['epsilon_range'],
        hdbscan_min_samples_range = config['binny_settings']['clustering']['hdbscan_min_samples_range'],
        quality_min_completeness = config['binny_settings']['bin_quality']['min_completeness'],
        quality_start_completeness = config['binny_settings']['bin_quality']['start_completeness'],
        quality_min_purity = config['binny_settings']['bin_quality']['purity'],
        outputdir_within_binny = config['binny_settings']['outputdir_binny'], # within binny.
        threads=THREAD
         #this place shall be the output, relating to binny!
    output:
        dir_binny = directory(OUTPUT_DIR/"working/binny")
    message:
        "Use the binning algorithm binny to find all high-coverage bins with completeness>=72.5% and purity>=95%."
    conda:
        "envs/binny.yml"
    shell:
        """
        set -e
        echo "Current mamba environment." 
        # Check if plastids are present.
        # If so, then run binny workflow.
        if [ -s {input.plastid_contigs} ]; then
            if [ -z {params.depth_text} ]
            then
                [ -n $BINNY_DIR/config/config.default.yaml ] && echo "default config file for binny found."
                [ -z $BINNY_DIR ] && echo "default binny dir not found." 
                bash {params.bash_create_yaml} -a "{input.plastid_contigs}" -b "$BINNY_DIR/binny_algal_database" -d "$BINNY_DIR/config/config.default.yaml" -l "{params.bamfiles}" -o "{params.outputdir_within_binny}"\
                -e '{params.hdbscan_epsilon_range}' -m '{params.hdbscan_min_samples_range}' -c {params.quality_min_completeness}\
                -s {params.quality_start_completeness} -p {params.quality_min_purity} -u {params.universal_length_cutoff}\
                -y "$BINNY_DIR/config/config.ChloroScan.yaml"
            else
                [ -n $BINNY_DIR/config/config.default.yaml ] && echo "default config file for binny found."
                bash {params.bash_create_yaml} -a "{input.plastid_contigs}" -b "$BINNY_DIR/binny_algal_database" -d "$BINNY_DIR/config/config.default.yaml" -t "{params.depth_text}" -o "{params.outputdir_within_binny}"\
                -e '{params.hdbscan_epsilon_range}' -m '{params.hdbscan_min_samples_range}' -c {params.quality_min_completeness}\
                -s {params.quality_start_completeness} -p {params.quality_min_purity} -u {params.universal_length_cutoff}\
                -y "$BINNY_DIR/config/config.ChloroScan.yaml"
            fi

            $BINNY_DIR/binny -l -n "MMA_plastids" -t {params.threads} -r $BINNY_DIR/config/config.ChloroScan.yaml

            # must remove the config.yaml created at the end, so we need it to be changed with name.
            rm $BINNY_DIR/config/config.ChloroScan.yaml # finally you have to remove it, you don't want to cause confusion.
            echo 'The binny workflow is done, bins are within the binny folder, that may need to move out.'
            # chmod u+w {output.dir_binny}
            mv {BINNY_OUTPUT_DIR} {output.dir_binny}
            chmod u+w {output.dir_binny}
            
            # make sure to remove the bam files within the intermediary folder.
            rm -rf {output.dir_binny}/intermediary/*.bam
            echo "The binny's output has been moved to the default output directory."
        else
            echo "No plastids present. Create an empty directory for output."
            mkdir -p {output.dir_binny}
        fi
        """

# Add another checkpoint here, I want to cease the workflow if there are no bins made of plastid contigs.

rule CAT_taxonomy_identification_plus_annotation:
    input: 
        binny_output=OUTPUT_DIR/"working/binny"
    params:
        prefix=GLOBAL_PREFIX_contigs,
        path_to_contigs="intermediary/assembly.formatted.fa"
    output:
        CAT_output=directory(OUTPUT_DIR/"working/CAT")
        # contig_classification=OUTPUT_DIR/"working/CAT/out.CAT.contig2classification.txt"
    log:
        OUTPUT_DIR/"logging_info/CAT.log"
    benchmark:
        OUTPUT_DIR/"logging_info/CAT_benchmark.tsv"
    threads:
        8
    resources:
        mem_mb=24000
    conda:
        "envs/CAT_env.yml"
    shell:
        """
        mkdir -p {output}

        # Check to see if the binny directory is empty or not
        find {input.binny_output} -mindepth 1 -maxdepth 1
        if [ -f "{input.binny_output}/{params.path_to_contigs}" ] ; then
        # CAT running is totally different, we shall make this changed.
            CAT contigs -c {input.binny_output}/{params.path_to_contigs} -d {CAT_DB_DIR} -t {CAT_TAXON_DIR} --force --top 11 --I_know_what_Im_doing
            # CAT add_names -i {params.prefix}.contig2classification.txt -o {params.prefix}.contig2classification.named.txt -t {CAT_DB_DIR}/2021-01-07_taxonomy --only_official
            
            mv ./{params.prefix}.* {output.CAT_output}
        fi
        """

# this is the correct one.
rule documenting_binny_results:
    input: 
        # bins = OUTPUT_DIR/"working/binny"/BINNY_OUTPUT_DIR/"bins",
        contig_level_annotation = directory(OUTPUT_DIR/"working/CAT")
    params: 
        bins = Path(OUTPUT_DIR/"working/binny/bins"),
        assembly_files = OUTPUT_DIR/"working/binny/intermediary/assembly.formatted.fa",
        assembly_depth = OUTPUT_DIR/"working/binny/intermediary/assembly.contig_depth.txt",
        marker_gene_gff = OUTPUT_DIR/"working/binny/intermediary/annotation_CDS_RNA_hmms_checkm.gff",
        names_dump=TAXA,
        MMA_summary=OUTPUT_DIR/"MMA.summary.txt",
        ANNOT_FILE="out.CAT.contig2classification.txt",
        script_dir = SCR_DIR
    output:
        OUTPUT_DIR/"working/summary/cross_ref.tsv"
    benchmark:
        OUTPUT_DIR/"logging_info/summarize_binny.tsv"
    threads:
        5
    conda:
        "envs/documenting_binny.yml"
    message:
        "Run python script summarize_binny_results.py to record each bin's contig information."
    script:
        "{params.script_dir}/summarize_binny_results.py"

rule refine_bins:
    input:
      cross_ref = OUTPUT_DIR/"working/summary/cross_ref.tsv",
      original_bins = Path(OUTPUT_DIR/"working/binny"),
      CAT_prediction = Path(OUTPUT_DIR/"working/CAT"),
    params:
      BACTERIA_ID = "2",
      script_dir = SCR_DIR
    output:
      refine_dir = directory(OUTPUT_DIR/"working/refined_bins")
    conda:
      "envs/refinement.yml"
    script:
      "{params.script_dir}/refine_bins.py"

rule visualize_results:
    input:
        OUTPUT_DIR/"working/summary/cross_ref.tsv",
        rules.refine_bins.output.refine_dir,
    params:
        BATCH_NAME,
        script_dir = SCR_DIR,
        refine_bins_dir=OUTPUT_DIR/"working/refined_bins",
    output:
        directory(OUTPUT_DIR/"working/visualizations")
    conda:
        "envs/visualization.yml"
    script:
        "{params.script_dir}/visualization.py"

rule cds_extraction:
    input:
        binny_dir = Path(OUTPUT_DIR/"working/refined_bins")
    params:
        batch_name=BATCH_NAME,
        gff_file_flag="GFFs",
        script_dir = SCR_DIR,
        path_fraggenescanrs = OUTPUT_DIR/"working/FragGeneScanRs",
    output:
        CDS_EXTRACTION=directory(OUTPUT_DIR/"working/cds-extraction"),
        END_FLAG=OUTPUT_DIR/"ChloroScan.done"
    conda:
        "envs/gffread.yml"
    message:
        "Use the fast ORF-predictor FragGeneScan Rust to predict gene contents of the MAGs, then extract the cds via gffread."
    threads:
        4
    script:
        "{params.script_dir}/cds-extraction.sh"

rule krona_taxonomy_plot:
    input: 
        CAT_pred_whole_data=rules.CAT_taxonomy_identification_plus_annotation.output.CAT_output
    params:
        file_to_remodel="out.CAT.contig2classification.txt",
        file_after_remodel="out.CAT.krona.intake.tsv",
        intake_krona="out.CAT.krona.intake2.tsv",
        script_dir = SCR_DIR,
        batch_name=BATCH_NAME
    conda:
        KRONA_ENV if KRONA_ENV else "envs/krona.yml"
    output:
        OUTPUT_DIR/"Krona.html"
    message:
        "essure that in your env there must have pandas, click and of course, Kronatools."
    shell:
        """
        # add if statement: if there are empty out.CAT file, we exit with touched files: Krona.html.
        if [ ! -s {input}/{params.file_to_remodel} ]; then
            echo "No contig predictions because previous rules did not find any plastid contigs."
            touch {output}
            exit 0
        fi 

        python {params.script_dir}/create_taxonomy_profile.py --cat_prediction_txt {input}/{params.file_to_remodel} --output {input}/{params.file_after_remodel}
        sed 1d {input}/{params.file_after_remodel} > {input}/{params.intake_krona}
        ktImportTaxonomy -m 1 -o {output} {input}/{params.intake_krona}
        """
