Source code for samsifter.stats.summarize_stats
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Feb 1 18:50:57 2015
.. moduleauthor:: Florian Aldehoff <faldehoff@student.uni-tuebingen.de>
"""
import sys
if not (sys.version_info[0] >= 3):
print("Error, I need python 3.x or newer")
exit(1)
import argparse
import logging as log
from os.path import isfile
from os import listdir
import re
import pandas as pd
import numpy as np
[docs]def main():
# parse arguments
parser = argparse.ArgumentParser(description="compile statistics from "
"temporary files")
parser.add_argument('-v', '--verbose',
required=False,
action='store_true',
help='print additional information to STDERR')
parser.add_argument('-d', '--debug',
required=False,
action='store_true',
help='print debug messages to STDERR')
parser.add_argument('-p', '--prefix',
required=False,
default='reads_per_taxon',
help='prefix of temporary statistics files')
(args, remainArgs) = parser.parse_known_args()
# configure logging
if args.verbose:
log.basicConfig(format="%(levelname)s: %(message)s", level=log.DEBUG)
else:
log.basicConfig(format="%(levelname)s: %(message)s")
# create list of stats files in working dir
files = []
for entry in listdir():
if isfile(entry):
match = re.fullmatch('.*\.sifted\.csv', entry)
if match:
files.append(match.group(0))
log.info("Found %i statistic files." % len(files))
if len(files) > 0:
summary = pd.DataFrame(dtype=np.float)
for filename in sorted(files):
log.info("Gathering data from file %s" % filename)
# read file
df = pd.read_csv(filename,
sep=',',
# index_col=0, # better set index explicitly
# engine='python', # C is faster, supports dtype
engine='c',
dtype={'taxon_id': str},
quotechar="'",
quoting=2)
df = df.set_index('taxon_id', drop=False)
# identify last column
redux = df.ix[:, -1:]
redux.columns = [filename]
# filter taxa without reads
filtered = redux[redux[filename] > 0]
# merge it with summary file (outer join)
summary = pd.concat([summary, filtered], axis=1)
# save summary to CSV
summary.to_csv(sys.stdout,
sep=',',
header=True,
# na_rep=0.0,
quotechar="'",
quoting=2)
exit()
if __name__ == "__main__":
main()