# Source code for samsifter.stats.compile_stats

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jan 30 12:25:44 2015

.. moduleauthor:: Florian Aldehoff <faldehoff@student.uni-tuebingen.de>
"""
import sys
# Guard: the rest of this module requires Python 3.
if not (sys.version_info[0] >= 3):
    # Write the error to STDERR, not STDOUT -- STDOUT carries the CSV output
    # of this tool.  sys.stderr.write (rather than print(..., file=...)) keeps
    # this guard parseable under Python 2, so it can actually fire there.
    sys.stderr.write("Error, I need python 3.x or newer\n")
    sys.exit(1)

import argparse
import logging as log
from os.path import isfile
from os import listdir, remove
import re
import pandas as pd
import numpy as np
# Widen pandas' console repr for debugging; use the fully-qualified option
# key ('display.max_columns') -- the bare abbreviation 'max_columns' relies
# on pandas' fuzzy option matching and may break if new options are added.
pd.set_option('display.max_columns', 50)


def _find_step_files(prefix):
    """Collect temporary statistics files from the working directory.

    Matches files named ``<prefix>.NNN.csv`` (three-digit step number)
    and returns a dict mapping step number -> filename.  A dict is used
    instead of a list because individual steps may be omitted, so the
    numbering can have gaps.

    :param prefix: filename prefix of the temporary statistics files
    :return: dict of int step index to str filename
    """
    # raw string + re.escape: '.' in the prefix must not act as a wildcard
    pattern = re.compile(r'%s\.(\d{3})\.csv' % re.escape(prefix))
    steps = {}
    for entry in listdir():
        if isfile(entry):
            match = pattern.fullmatch(entry)
            if match:
                steps[int(match.group(1))] = entry
    return steps


def main():
    """Compile per-step read count statistics into one CSV on STDOUT.

    Gathers all ``<prefix>.NNN.csv`` files from the current working
    directory, merges their ``read_count`` columns (indexed by
    ``taxon_id``) into one sparse table with one column per step, and
    writes the result to STDOUT.  Optionally removes the temporary
    files afterwards (``--remove``).
    """
    # parse arguments
    parser = argparse.ArgumentParser(description="compile statistics from "
                                     "temporary files")
    parser.add_argument('-v', '--verbose', required=False,
                        action='store_true',
                        help='print additional information to STDERR')
    parser.add_argument('-d', '--debug', required=False, action='store_true',
                        help='print debug messages to STDERR')
    parser.add_argument('-r', '--remove', required=False, action='store_true',
                        help='remove temporary statistics files after use')
    parser.add_argument('-p', '--prefix', required=False,
                        default='reads_per_taxon',
                        help='prefix of temporary statistics files')
    (args, remainArgs) = parser.parse_known_args()

    # configure logging; --debug was previously parsed but never honoured
    if args.debug:
        log.basicConfig(format="%(levelname)s: %(message)s", level=log.DEBUG)
    elif args.verbose:
        log.basicConfig(format="%(levelname)s: %(message)s", level=log.INFO)
    else:
        log.basicConfig(format="%(levelname)s: %(message)s")

    steps = _find_step_files(args.prefix)
    log.info("Found %i temporary statistic files with prefix '%s'."
             % (len(steps), args.prefix))

    if len(steps) > 0:
        # np.float was removed in NumPy 1.20; the builtin float is equivalent
        readcounts = pd.DataFrame(dtype=float)
        # process files sorted by consecutive step number
        for idx, filename in sorted(steps.items()):
            log.info("Gathering data from step %i (%s)" % (idx, filename))
            df = pd.read_csv(filename, sep=',',
                             engine='c',  # C engine is faster, supports dtype
                             dtype={'taxon_id': str, 'read_count': float},
                             quotechar="'", quoting=2)
            # keep taxon_id both as index (for alignment) and as a column
            df = df.set_index('taxon_id', drop=False)
            # merge stats into sparse array, assumes we never gain new taxa
            readcounts['read_count_%i' % idx] = df['read_count']

        # save merged array to CSV on STDOUT
        readcounts.to_csv(sys.stdout, sep=',', header=True, na_rep=0.0,
                          quotechar="'", quoting=2)

        # remove stats files only after a successful merge
        if args.remove:
            for filename in steps.values():
                remove(filename)
    # NOTE: the original ended with a bare exit(); a plain return yields the
    # same exit status 0 without raising SystemExit for library callers.


if __name__ == "__main__":
    main()