#!python
#
#     Copyright (C) 2017–2026, Jose Manuel Martí Martínez
#
#     This program is free software: you can redistribute it and/or modify
#     it under the terms of the GNU Affero General Public License as
#     published by the Free Software Foundation, either version 3 of the
#     License, or (at your option) any later version.
#
#     This program is distributed in the hope that it will be useful,
#     but WITHOUT ANY WARRANTY; without even the implied warranty of
#     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#     GNU Affero General Public License for more details.
#
#     You should have received a copy of the GNU Affero General Public License
#     along with this program. If not, see <https://www.gnu.org/licenses/>.
#
"""
Analyze metagenomic (functional) classification data.
"""
# pylint: disable=no-name-in-module, not-an-iterable
import argparse
import bz2
import collections as col
import multiprocessing as mp
import os
import pickle
import platform
import sys
import time
from typing import Counter, List, Dict, Set, Any

from recentrifuge import __version__, __author__, __date__
from recentrifuge.config import DEFMINGENE, GODUMP_PATH, NODES_FILE, NAMES_FILE
from recentrifuge.config import Filename, Sample, Id, Score, Scoring, Extra
from recentrifuge.config import Chart
from recentrifuge.config import HTML_SUFFIX, PKL_SUFFIX, TAXDUMP_PATH
from recentrifuge.config import STATS_SHEET_NAME, LICENSE, Err, Classifier
from recentrifuge.config import STR_CONTROL, STR_EXCLUSIVE, STR_SHARED
from recentrifuge.config import STR_CONTROL_SHARED
from recentrifuge.config import gray, red, green, yellow, blue, magenta
from recentrifuge.core import process_rank, summarize_analysis
from recentrifuge.gene_ontology import GeneOntology
from recentrifuge.go import process_goutput
from recentrifuge.krona import COUNT, UNASSIGNED, SCORE
from recentrifuge.krona import KronaTree
from recentrifuge.rank import Rank, TaxLevels
from recentrifuge.stats import SampleStats
from recentrifuge.trees import TaxTree, MultiTree, SampleDataById

import openpyxl
import pandas as pd


def _debug_dummy_plot(geno: GeneOntology,
                      htmlfile: Filename,
                      scoring: Scoring = Scoring.SHEL,
                      ):
    """

    Generate dummy Krona plot via Krona 2.0 XML spec and exit

    """
    print(gray(f'Generating dummy Krona plot {htmlfile}...'), end='')
    sys.stdout.flush()
    samples: List[Sample] = [Sample('SINGLE'), ]
    krona: KronaTree = KronaTree(samples,
                                 min_score=Score(35),
                                 max_score=Score(100),
                                 scoring=scoring,
                                 )
    polytree: MultiTree = MultiTree(samples=samples)
    polytree.grow(ontology=geno)
    polytree.toxml(ontology=geno, krona=krona)
    krona.tohtml(htmlfile, pretty=True)
    print(green('OK!'))


def main():
    """Main entry point to Regentrifuge."""

    def vprint(*arguments) -> None:
        """Print only if verbose/debug mode is enabled"""
        if args.debug:
            print(*arguments)

    def configure_parser():
        """Argument Parser Configuration"""
        parser = argparse.ArgumentParser(
            description=('Robust comparative analysis and contamination '
                         'removal for functional metagenomics'),
            epilog=f'%(prog)s - Release {__version__} - {__date__}' + LICENSE,
            formatter_class=argparse.RawDescriptionHelpFormatter
        )
        parser.add_argument(
            '-V', '--version',
            action='version',
            version=f'%(prog)s release {__version__} ({__date__})'
        )
        parser.add_argument(
            '-n', '--nodespath',
            action='store',
            metavar='PATH',
            default=GODUMP_PATH,
            help=('path for the nodes information files '
                  '(nodes.dmp and names.dmp adapted from GO)')
        )
        parser_filein = parser.add_mutually_exclusive_group(required=True)
        parser_filein.add_argument(
            '-f', '--file',
            action='append',
            metavar='FILE',
            type=Filename,
            help=('GO output files in tab separated columns with the format:'
                  'taxid, gi, go, score and evalue. Currently, the data for '
                  'the analysis is retrieved from the last 3 columns. '
                  ' Multiple -f is available to include several samples.')
        )
        parser_cross = parser.add_mutually_exclusive_group(required=False)
        parser_cross.add_argument(
            '-a', '--avoidcross',
            action='store_true',
            help='avoid cross analysis'
        )
        parser_cross.add_argument(
            '-c', '--controls',
            action='store',
            metavar='CONTROLS_NUMBER',
            type=int,
            default=0,
            help=('this number of first samples will be treated as negative '
                  'controls; default is no controls. CAUTION! Regentrifuge '
                  'direct support of control samples is experimental.')
        )
        parser_output = parser.add_argument_group(
            'output', 'Related to the output files')
        parser_output.add_argument(
            '-e', '--extra',
            action='store',
            metavar='OUTPUT_TYPE',
            choices=[str(extra) for extra in Extra],
            default=str(Extra(0)),
            help=(f'type of scoring to be applied, and can be one of '
                  f'{[str(extra) for extra in Extra]}')
        )
        parser_output.add_argument(
            '-o', '--outhtml',
            action='store',
            metavar='FILE',
            type=Filename,
            help='HTML output file (if not given the filename will be '
                 'inferred from input files)',
        )
        parser_output.add_argument(
            '-p', '--pickle',
            action='count',
            default=0,
            help=('pickle (serialize) statistics and data results in pandas '
                  'DataFrames (format affected by selection of --extra); one'
                  ' additional flag and the input samples will be serialized'
                  ' as a dict of sample names and TaxTree objects')
        )
        parser_mode = parser.add_argument_group('mode',
                                                'Specific modes of running')
        parser_mode.add_argument(
            '--dummy',  # hidden flag: just generate a dummy plot for JS debug
            action='store_true',
            help=argparse.SUPPRESS
        )
        parser_mode.add_argument(
            '-g', '--debug',
            action='store_true',
            help='increase output verbosity and perform additional checks'
        )
        parser_mode.add_argument(
            '--sequential',
            action='store_true',
            help='deactivate parallel processing'
        )
        parser_mode.add_argument(
            '-T', '--threads',
            action='store',
            metavar='NUMBER',
            type=int,
            default=0,
            help=('number of threads to use for parallel processing; '
                  '0 (default) means legacy mode using min(cpu_count, samples)')
        )
        parser_tuning = parser.add_argument_group(
            'tuning',
            'Fine tuning of algorithm parameters')
        parser_tuning.add_argument(
            '-m', '--mingene',
            action='store',
            metavar='INT',
            type=int,
            default=DEFMINGENE,
            help='minimum genes to avoid collapsing GO genes hierarchy levels'
        )
        parser_tuning.add_argument(
            '-i', '--include',
            action='append',
            metavar='TAXID',
            type=Id,
            default=[],
            help=('GO code to include a GO node and all underneath '
                  '(multiple -i is available to include several GOs); '
                  'by default all the GOs are considered for inclusion')
        )
        parser_tuning.add_argument(
            '-x', '--exclude',
            action='append',
            metavar='TAXID',
            type=Id,
            default=[],
            help=('GO code to exclude a GO node and all underneath '
                  '(multiple -x is available to exclude several GOs)')
        )
        parser_tuning.add_argument(
            '-s', '--scoring',
            action='store',
            metavar='SCORING',
            choices=[str(each_score) for each_score in Scoring],
            default=str(Scoring(0)),
            help=argparse.SUPPRESS,  # Different scoring still not supported
            # help=(f'type of scoring to be applied, and can be one of '
            #      f'{[str(scoring) for scoring in Scoring]}')
        )
        parser_tuning.add_argument(
            '-u', '--summary',
            action='store',
            metavar='OPTION',
            choices=['add', 'only', 'avoid'],
            default='add',
            help=('select to "add" summary samples to other samples, or to '
                  '"only" show summary samples or to "avoid" summaries at all')
        )
        parser_tuning.add_argument(
            '-t', '--takeoutroot',
            action='store_true',
            help='remove counts directly assigned to the "root" level'
        )
        parser_tuning.add_argument(
            '-y', '--minscore',
            action='store',
            metavar='NUMBER',
            type=lambda txt: Score(float(txt)),
            default=None,
            help=('minimum score/confidence of the annotation of a scaffold '
                  'to pass the quality filter; all pass by default')
        )
        return parser

    def get_num_processes(num_items: int) -> int:
        """Calculate number of processes for multiprocessing Pool.
        
        Args:
            num_items: The number of items to process (samples, ranks, etc.)
            
        Returns:
            Number of processes to use in the Pool.
            
        If --threads is 0 (default/legacy mode), uses min(cpu_count, num_items).
        If --threads > 0, uses min(threads, num_items) to avoid idle processes.
        """
        if args.threads == 0:
            # Legacy behavior: use min of CPU count and number of items
            cpu_count = os.cpu_count() or 1
            return min(cpu_count, num_items)
        else:
            # User specified number of threads
            return min(args.threads, num_items)

    def check_debug():
        """Check debugging mode"""
        if args.debug:
            print(gray('INFO: Debugging mode activated\n'))
        if args.threads > 0:
            print(blue('INFO:'), gray(f'Using {args.threads} thread(s) for parallel processing'))

    def read_samples():
        """Read samples"""
        print(gray('\nPlease, wait, processing files in parallel...\n'))
        # Enable parallelization with 'spawn' under known platforms
        if platform.system() and not args.sequential:  # Only for known systems
            mpctx = mp.get_context('fork')
            with mpctx.Pool(processes=get_num_processes(
                                          len(input_files))) as pool:
                async_results = [pool.apply_async(
                    process_goutput,
                    args=[input_files[num],  # file name
                          True if num < args.controls else False],  # is ctrl?
                    kwds=kwargs
                ) for num in range(len(input_files))]
                for file, (sample, tree, out, stat, err) in zip(
                        input_files, [r.get() for r in async_results]):
                    if err is Err.NO_ERROR:
                        samples.append(sample)
                        trees[sample] = tree
                        goids[sample] = out.get_taxlevels()
                        if out.counts is not None:
                            counts[sample] = out.counts
                        if out.accs is not None:
                            accs[sample] = out.accs
                        if out.scores is not None:
                            scores[sample] = out.scores
                        stats[sample] = stat
                        mintaxas[sample] = stat.mintaxa
                    elif err is Err.VOID_CTRL:
                        print('There were void controls.', red('Aborting!'))
                        exit(1)
        else:  # sequential processing of each sample
            for num, file in enumerate(input_files):
                (sample, tree, out, stat, err) = process_goutput(
                    file, True if num < args.controls else False, **kwargs)
                if err is Err.NO_ERROR:
                    samples.append(sample)
                    trees[sample] = tree
                    goids[sample] = out.get_taxlevels()
                    if out.counts is not None:
                        counts[sample] = out.counts
                    if out.accs is not None:
                        accs[sample] = out.accs
                    if out.scores is not None:
                        scores[sample] = out.scores
                    stats[sample] = stat
                    mintaxas[sample] = stat.mintaxa
                elif err is Err.VOID_CTRL:
                    print('There were void controls.', red('Aborting!'))
                    exit(1)
        raw_samples.extend(samples)  # Store raw sample names

    def analyze_samples():
        """Cross analysis of samples in parallel by rank"""
        print(gray('Please, wait. Performing cross analysis in parallel...\n'))
        # Update kwargs with more parameters for the followings func calls
        kwargs.update({'taxids': goids, 'counts': counts, 'scores': scores,
                       'accs': accs, 'raw_samples': raw_samples})
        if platform.system() and not args.sequential:  # Only for known systems
            mpctx = mp.get_context('fork')  # Important for OSX&Win
            with mpctx.Pool(processes=get_num_processes(len(
                    Rank.genomic_ranks))) as pool:
                async_results = [pool.apply_async(
                    process_rank,
                    args=[level],
                    kwds=kwargs
                ) for level in Rank.genomic_ranks]
                for level, (smpls, abunds, accumulators, score) in zip(
                        Rank.genomic_ranks,
                        [r.get() for r in async_results]):
                    samples.extend(smpls)
                    counts.update(abunds)
                    accs.update(accumulators)
                    scores.update(score)
        else:  # sequential processing of each selected rank
            for level in Rank.genomic_ranks:
                (smpls, abunds,
                 accumulators, score) = process_rank(level, **kwargs)
                samples.extend(smpls)
                counts.update(abunds)
                accs.update(accumulators)
                scores.update(score)

    def summarize_samples():
        """Summary of samples in parallel by type of cross-analysis"""
        # timing initialization
        summ_start_time: float = time.perf_counter()
        print(gray('Please, wait. Generating summaries in parallel...'))
        # Update kwargs with more parameters for the followings func calls
        kwargs.update({'samples': samples})
        # Get list of set of samples to summarize (note pylint bug #776)
        # pylint: disable=unsubscriptable-object
        target_analysis: col.OrderedDict[str, None] = col.OrderedDict({
            f'{raw}_{study}': None for study in [STR_EXCLUSIVE, STR_CONTROL]
            for raw in raw_samples
            for smpl in samples if smpl.startswith(f'{raw}_{study}')
        })
        # pylint: enable=unsubscriptable-object
        # Add shared and control_shared analysis if they exist (are not void)
        for study in [STR_SHARED, STR_CONTROL_SHARED]:
            for smpl in samples:
                if smpl.startswith(study):
                    target_analysis[study] = None
                    break

        if platform.system() and not args.sequential:  # Only for known systems
            mpctx = mp.get_context('fork')
            with mpctx.Pool(processes=get_num_processes(
                                          len(target_analysis))) as pool:
                async_results = [pool.apply_async(
                    summarize_analysis,
                    args=[analysis],
                    kwds=kwargs
                ) for analysis in target_analysis]
                for analysis, (summary, abund, acc, score) in zip(
                        target_analysis, [r.get() for r in async_results]):
                    if summary:  # Avoid adding empty samples
                        summaries.append(summary)
                        counts[summary] = abund
                        accs[summary] = acc
                        scores[summary] = score
        else:  # sequential processing of each selected rank
            for analysis in target_analysis:
                (summary, abund,
                 acc, score) = summarize_analysis(analysis, **kwargs)
                if summary:  # Avoid adding empty samples
                    summaries.append(summary)
                    counts[summary] = abund
                    accs[summary] = acc
                    scores[summary] = score
        # Timing results
        print(gray('Summary elapsed time:'),
              f'{time.perf_counter() - summ_start_time:.3g}', gray('sec'))

    def generate_krona():
        """Generate Krona plot with all the results via Krona 2.0 XML spec"""

        print(gray('\nBuilding the ontology multiple tree... '), end='')
        sys.stdout.flush()
        krona: KronaTree = KronaTree(samples,
                                     num_raw_samples=len(raw_samples),
                                     stats=stats,
                                     min_score=Score(
                                         min([min(scores[sample].values())
                                              for sample in samples
                                              if len(scores[sample])])),
                                     max_score=Score(
                                         max([max(scores[sample].values())
                                              for sample in samples
                                              if len(scores[sample])])),
                                     scoring=scoring,
                                     chart=Chart.GENOMIC,
                                     )
        polytree.grow(ontology=geno,
                      abundances=counts,
                      accs=accs,
                      scores=scores)
        print(green('OK!'))
        print(gray('Generating final plot (') + magenta(htmlfile) +
              gray(')... '), end='')
        sys.stdout.flush()
        polytree.toxml(ontology=geno, krona=krona)
        krona.tohtml(htmlfile, pretty=False)
        print(green('OK!'))

    def save_extra_output():
        """Save extra output with results via pandas DataFrame"""

        # Initial check and info
        if extra is Extra.FULL or extra is Extra.DYNOMICS:
            vprint(blue('INFO:'),
                   gray('Saving extra output as an Excel file.'))
        elif extra is Extra.CSV:
            vprint(blue('INFO:'),
                   gray('Saving extra output as CSV files.'))
        elif extra is Extra.TSV:
            vprint(blue('INFO:'),
                   gray('Saving extra output as TSV files.'))
        elif extra is Extra.MULTICSV:
            vprint(blue('INFO:'),
                   gray('Saving extra output as multiple CSV files.'))
        else:
            raise Exception(f'ERROR! Unknown Extra option "{extra}"')

        # Setup of extra file name
        xlsxwriter: pd.ExcelWriter | None = None
        sv_base: Filename = Filename(htmlfile.split('.html')[0])
        sv_ext: Filename = Filename('')  # Will be set below for CSV/TSV modes
        sv_sep: str = ','  # Default CSV separator
        data_frame: pd.DataFrame | None = None  # Initialize for pickle section
        if extra is Extra.FULL or extra is Extra.DYNOMICS:
            xlsx_name: Filename = Filename(htmlfile.split('.html')[0] +
                                           '.xlsx')
            print(gray(f'Generating Excel {str(extra).lower()} summary (') +
                  magenta(xlsx_name) + gray(')... '), end='')
            sys.stdout.flush()
            xlsxwriter = pd.ExcelWriter(xlsx_name)
        elif extra in [Extra.CSV, Extra.TSV, Extra.MULTICSV]:
            if extra in [Extra.CSV, Extra.MULTICSV]:
                sv_ext = Filename('csv')
                sv_sep = ','
            elif extra is Extra.TSV:
                sv_ext = Filename('tsv')
                sv_sep = '\t'
            print(gray(f'Generating {str(extra).lower()} extra output (') +
                  magenta('[' + sv_base + '.]*.' + sv_ext)
                  + gray(')... '), end='')
            sys.stdout.flush()
        else:
            raise Exception(f'ERROR! Unknown Extra option "{extra}"')
        odict_rows: Dict[Id, List] = col.OrderedDict()

        # Save basic statistics of raw samples
        stat_frame = pd.DataFrame.from_records(
            {raw: stats[raw].to_odict() for raw in raw_samples},
            index=list(stats[raw_samples[0]].to_odict().keys()))
        if extra is Extra.FULL or extra is Extra.DYNOMICS:
            stat_frame.to_excel(xlsxwriter, sheet_name=STATS_SHEET_NAME)
        elif extra in [Extra.CSV, Extra.TSV, Extra.MULTICSV]:
            stat_frame.to_csv(Filename(sv_base + '.stat.' + sv_ext), sep=sv_sep)

        # Save taxid related statistics per sample
        if extra in [Extra.FULL, Extra.CSV, Extra.TSV, Extra.MULTICSV]:
            polytree.to_odict(ontology=geno, odict=odict_rows)
            # Generate the pandas DataFrame from items and export to Extra
            iterable_1 = [samples, [COUNT, UNASSIGNED, SCORE]]
            cols1 = pd.MultiIndex.from_product(iterable_1,
                                               names=['Samples', 'Stats'])
            iterable_2 = [['Details'], ['Rank', 'Name']]
            cols2 = pd.MultiIndex.from_product(iterable_2)
            cols = cols1.append(cols2)
            data_frame = pd.DataFrame.from_dict(
                odict_rows, orient='index', columns=cols)
            data_frame.index.names = ['Id']
            if extra in [Extra.CSV, Extra.TSV]:
                data_frame.to_csv(Filename(sv_base + '.data.' + sv_ext), sep=sv_sep)
            elif extra is Extra.MULTICSV:
                for sample in samples:  # Only save taxa with counts per sample
                    sub_df = data_frame[
                        data_frame[sample, 'count'] > 0][[sample, 'Details']]
                    csv_name: Filename
                    # Shared samples need help (sv_base) to get the path right
                    if sample.startswith(STR_SHARED):
                        csv_name = Filename(sv_base + '_' + sample
                                            + '.data.' + sv_ext)
                    else:
                        csv_name = Filename(sample + '.data.' + sv_ext)
                    sub_df.to_csv(csv_name, sep=sv_sep)
            else:
                data_frame.to_excel(xlsxwriter, sheet_name=str(extra))
        elif extra is Extra.DYNOMICS:
            dyn_target_ranks: List[Rank] = [Rank.NO_RANK]
            if args.controls:  # if controls, add genomic ranks as targets
                dyn_target_ranks.extend(Rank.genomic_ranks)
            for rank in dyn_target_ranks:  # Once for no rank dependency (NO_RANK)
                indexes: List[int] = []  # Initialize as empty list
                sheet_name: str = 'RGF sheet'
                columns: List[str] = ['__Rank', '__Name']
                if args.controls:
                    indexes = [i for i in range(len(raw_samples), len(samples))
                               # Check if sample ends in _(STR_CONTROL)_(rank)
                               if (STR_CONTROL in samples[i].split('_')[-2:]
                                   and rank.name.lower() in
                                   samples[i].split('_')[-1:])]
                    sheet_name = f'{STR_CONTROL}_{rank.name.lower()}'
                    columns.extend([samples[i].replace(
                        '_' + STR_CONTROL + '_' + rank.name.lower(), '')
                        for i in indexes])
                if rank is Rank.NO_RANK:  # No rank dependency
                    indexes = list(range(len(raw_samples)))
                    sheet_name = f'raw_samples_{rank.name.lower()}'
                    columns.extend(raw_samples)
                polytree.to_odict(ontology=geno, odict=odict_rows,
                                  cmplxcruncher=True,
                                  sample_indexes=indexes)
                data_frame = pd.DataFrame.from_dict(odict_rows,
                                                    orient='index',
                                                    columns=columns)
                data_frame.index.names = ['Id']
                data_frame.to_excel(xlsxwriter, sheet_name=sheet_name)
        else:
            raise Exception(f'ERROR! Unknown Extra option "{extra}"')
        if xlsxwriter is not None:
            xlsxwriter.close()
        print(green('OK!'))

        # Serialize dataframes in compressed pickle files
        if args.pickle > 0:
            # Serialize dataframe with basic statistics of raw samples
            vprint(blue('INFO:'),
                   gray('Serializing dataframes with stats and data.'))
            pkl_file: Filename = Filename(sv_base + '.stat' + PKL_SUFFIX)
            print(gray(f'Generating output file ') +
                  magenta(pkl_file) + gray(')... '), end='')
            sys.stdout.flush()
            stat_frame.to_pickle(pkl_file, compression='infer', protocol=-1)
            print(green('OK!'))
            # Serialize dataframe with taxid related statistics per sample
            pkl_file = Filename(sv_base + '.data' + PKL_SUFFIX)
            print(gray(f'Generating output file ') +
                  magenta(pkl_file) + gray(')... '), end='')
            sys.stdout.flush()
            if data_frame is not None:
                data_frame.to_pickle(pkl_file, compression='infer', protocol=-1)
            print(green('OK!'))
            if args.pickle > 1:  # Serializing 'samples' as well
                vprint(blue('INFO:'),
                       gray('Serializing input sample\'s dict of TaxTree(s).'))
                pkl_file = Filename(sv_base + '.samples' + PKL_SUFFIX)
                print(gray(f'Generating output file ') +
                      magenta(pkl_file) + gray(')... '), end='')
                sys.stdout.flush()
                with bz2.BZ2File(pkl_file, 'wb') as fbz2:
                    pickle.dump(trees, fbz2, pickle.HIGHEST_PROTOCOL)
                print(green('OK!'))

    # timing initialization
    start_time: float = time.time()
    # Program header
    print(f'\n=-= {sys.argv[0]} =-= v{__version__} =-= {__date__} =-=\n')
    sys.stdout.flush()

    # Parse arguments
    argparser = configure_parser()
    args = argparser.parse_args()
    outputs: List[Filename] = args.file
    input_files: List[Filename]
    nodesfile: Filename = Filename(os.path.join(args.nodespath, NODES_FILE))
    namesfile: Filename = Filename(os.path.join(args.nodespath, NAMES_FILE))
    htmlfile: Filename = args.outhtml
    scoring: Scoring = Scoring[args.scoring]
    args.ctrlminscore = None
    args.ctrlmingene = None
    excluding: Set[Id] = set(args.exclude)
    including: Set[Id] = set(args.include)
    extra: Extra = Extra[args.extra]

    check_debug()
    input_files = outputs  # Done by select_inputs() in recentrifuge
    if htmlfile is None:   # Done by select_html_file() in recentrifuge
        htmlfile = Filename(outputs[0].split('_go')[0] + HTML_SUFFIX)

    # Load GO nodes, names and build children
    geno: GeneOntology = GeneOntology(nodesfile, namesfile,
                                    excluding, including, args.debug)

    # If dummy flag enabled, just create dummy krona and exit
    if args.dummy:
        _debug_dummy_plot(geno, htmlfile, scoring)
        exit(0)

    # Declare variables of known data before the core analysis
    samples: List[Sample] = []
    raw_samples: List[Sample] = []

    # Declare variables that will hold results for the samples analyzed
    trees: Dict[Sample, TaxTree] = {}
    counts: Dict[Sample, Any] = {}  # UnionCounter from SampleDataById
    accs: Dict[Sample, Counter[Id]] = {}
    goids: Dict[Sample, TaxLevels] = {}
    scores: Dict[Sample, Any] = {}  # UnionScores from SampleDataById
    stats: Dict[Sample, SampleStats] = {}
    mintaxas: Dict[Sample, int] = {}

    # Define dictionary of parameters for methods to be called (to be extended)
    kwargs = {'controls': args.controls,
              'ctrlminscore': (
                  args.ctrlminscore
                  if args.ctrlminscore is not None else args.minscore),
              'ctrlmintaxa': (
                  args.ctrlmingene
                  if args.ctrlmingene is not None else args.mingene),
              'debug': args.debug, 'root': args.takeoutroot,
              'minscore': args.minscore,
              'mintaxa': args.mingene, 'scoring': scoring, 'ontology': geno,
              }
    # Read the samples in parallel
    read_samples()
    # Update kwargs with more parameters for the followings func calls
    kwargs.update({'goids': goids, 'counts': counts, 'scores': scores,
                   'accs': accs, 'raw_samples': raw_samples,
                   'mintaxas': mintaxas})
    kwargs.pop('mintaxa')  # mintaxa info is included in mintaxas dict
    kwargs.pop('ctrlmintaxa')  # ctrlmintaxa info is included in mintaxas dict

    # Avoid cross analysis if just one report file or explicitly stated by flag
    if len(raw_samples) > 1 and not args.avoidcross:
        analyze_samples()
        if args.summary != 'avoid':
            summaries: List[Sample] = []
            summarize_samples()
            if args.summary == 'only':
                samples = raw_samples + summaries
            else:
                samples.extend(summaries)
    # Final result generation is done in sequential mode

    polytree: MultiTree = MultiTree(samples=samples)
    generate_krona()
    save_extra_output()

    # Timing results
    print(gray('Total elapsed time:'), time.strftime(
        "%H:%M:%S", time.gmtime(time.time() - start_time)))


if __name__ == '__main__':
    main()
