#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Identify reference accessions with uneven coverage in MALT'ed SAM files.

.. moduleauthor:: Florian Aldehoff <faldehoff@student.uni-tuebingen.de>
"""
import sys
if not sys.version_info[0] >= 3:
    print("Error: this script requires Python 3.x or newer")
    exit(1)

import argparse
import logging as log
import fileinput
import tempfile
from os.path import basename, splitext
import re

import matplotlib as mpl
# select 'Agg' backend before importing pyplot, else trouble on headless server
mpl.use('Agg')
import matplotlib.pyplot as plt
import numpy as np

""" custom libraries """
from samsifter.models.filter import FilterItem
from samsifter.models.parameter import (FilterParameter, FilterSwitch,
                                        FilterThreshold, FilterFilepath)
from samsifter.util.arg_sanitation import (check_pos_float,
                                           check_pos_float_max1,
                                           check_pos_int, check_sam)
from samsifter.util.wrappers import line_filter

""" global variables """
TEXT = "Filter references by evenness of coverage"
DESC = "Filters reference accessions exceeding a Gini Index threshold."


def item():
    """Create item representing this tool in list and tree views.

    Returns
    -------
    FilterItem
        Item for use in item-based list and tree views.
    """
    item = FilterItem(text=TEXT, desc=DESC)
    item.set_command(splitext(basename(__file__))[0])

    item.add_parameter(FilterThreshold(
        text="min. Gini",
        desc="minimum Gini coefficient",
        cli_name='--min_gini',
        default=0.00,
        maximum=1.00
    ))
    item.add_parameter(FilterThreshold(
        text="max. Gini",
        desc="maximum Gini coefficient",
        cli_name='--max_gini',
        default=1.00,
        maximum=1.00,
        required=True,
        active=True
    ))
    item.add_parameter(FilterParameter(
        text="read lengths",
        desc="analyze read length distributions",
        cli_name="--read_lengths",
        default=False
    ))
    item.add_parameter(FilterParameter(
        text="covered bases only",
        desc="covered bases only, ignore uncovered bases",
        cli_name="--covered",
        default=False
    ))
    item.add_parameter(FilterFilepath(
        text="statistics file",
        desc=("override filename for tab-separated statistics "
              "[${filename}.stats.csv]"),
        cli_name='--stats_file',
        default="${filename}.stats.csv"
    ))
    item.add_parameter(FilterParameter(
        text="plot distribution(s)",
        desc="plot coverage and/or read length distributions",
        cli_name="--plot",
        default=False
    ))
    item.add_parameter(FilterThreshold(
        text="LCI argument",
        desc="fraction of mean coverage to use for LCI integration",
        cli_name='--lci_arg',
        default=0.50,
        maximum=1.00
    ))
    item.add_parameter(FilterThreshold(
        text="min. LCI",
        desc="minimum low-coverage index",
        cli_name='--min_lci',
        default=0.0,
        maximum=1000.0
    ))
    item.add_parameter(FilterThreshold(
        text="max. LCI",
        desc="maximum low-coverage index",
        cli_name='--max_lci',
        default=1000.0,
        maximum=1000.0
    ))
    item.add_parameter(FilterThreshold(
        text="min. coverage",
        desc="minimum average coverage",
        cli_name='--min_cov',
        default=0,
        maximum=9999,
        precision=0
    ))
    item.add_parameter(FilterThreshold(
        text="max. coverage",
        desc="maximum average coverage",
        cli_name='--max_cov',
        default=9999,
        maximum=9999,
        precision=0
    ))
    item.add_parameter(FilterSwitch(
        text="filter direction",
        desc="Keep or discard entries passing the filter criteria?",
        cli_name="--discard",
        default=0,
        options=["discard", "keep"]
    ))
    item.add_parameter(FilterParameter(
        text="verbose",
        desc="print additional information to STDERR",
        cli_name="--verbose",
        default=True,
        active=True
    ))
    item.add_parameter(FilterParameter(
        text="debug",
        desc="print debug messages to STDERR",
        cli_name="--debug",
        default=False
    ))
    return item
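

# Hedged sketch (not part of the original module): the parameters above
# mirror the CLI flags parsed in main(), so a hypothetical invocation could
# look like:
#     python3 <this script> --max_gini 0.8 --min_cov 5 --lci_arg 0.5 --plot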
""" plotting methods """
[docs]def plot_rld(ax, length_dist, min_length, max_length, read_count,
length_dist_total):
"""
Create a bar plot of a read length distribution.
Includes scaled expected distribution based on all reads in file.
"""
ax.set_title('RLD')
ax.bar(np.arange(length_dist.size),
length_dist,
align='center',
width=0.8,
color='gray',
label='actual')
ax.plot(np.arange(min_length, max_length + 1, dtype=np.int),
# scale expected dist down
(length_dist_total * 1.0 / read_count) * np.sum(length_dist),
color='red',
linestyle='dashed',
label='expected')
ax.axis([min_length, max_length, 0, None])
ax.set_xlabel("read length [bp]")
ax.set_ylabel("number of reads")


def plot_cd(ax, depth_dist, avg_depth):
    """
    Create a bar plot of a coverage distribution.
    """
    ax.set_title("CD")
    ax.bar(np.arange(depth_dist.size, dtype=int),
           depth_dist,
           align='center',
           width=0.8,
           color='gray')
    ax.axis([-0.5, depth_dist.size - 0.5, 0, None])
    ax.axvline(x=avg_depth,
               color='green',
               label="avg. coverage: %.2fx" % (avg_depth,))
    ax.set_xlabel("coverage")
    ax.set_ylabel("reference bases")
    ax.legend(loc=1)  # upper right


def plot_scd(ax, depth_dist, avg_depth, ref_length, max_depth):
    """
    Create a bar plot of a scaled coverage distribution.
    """
    depth_dist = depth_dist * 1.0 / ref_length
    ax.set_title("SCD")
    ax.bar(np.arange(depth_dist.size, dtype=int),
           depth_dist,
           align='center',
           width=0.8,
           color='gray')
    ax.axis([-0.5, depth_dist.size - 0.5, 0, 1])
    ax.axvline(x=avg_depth,
               color='green',
               label="avg. coverage: %.2fx" % (avg_depth,))
    ax.set_xlabel("coverage")
    ax.set_ylabel("fraction of reference bases")
    ax.legend(loc=1)  # upper right


def plot_ccd(ax, depth_dist_cumsum, avg_depth):
    """
    Create a bar plot of a cumulative coverage distribution.
    """
    ax.set_title("CCD")
    ax.bar(np.arange(depth_dist_cumsum.size, dtype=int),
           depth_dist_cumsum,
           align='center',
           width=0.8,
           color='gray')
    ax.axis([-0.5, depth_dist_cumsum.size - 0.5, 0, None])
#    ax.axis([None, None, 0, None])
    ax.axvline(x=avg_depth,
               color='green',
               label="avg. coverage: %.2fx" % (avg_depth,))
    ax.set_xlabel("coverage")
#    ax.set_xticks(None, None, 1.0)
    ax.set_ylabel("cumulative bases")


def plot_sccd(ax, depth_dist_cumperc, avg_depth):
    """
    Create a bar plot of a scaled cumulative coverage distribution.
    """
    ax.set_title("SCCD")
    ax.bar(np.arange(depth_dist_cumperc.size),
           depth_dist_cumperc,
           align='edge',
           width=1,
           color='gray')
    ax.axis([0, depth_dist_cumperc.size - 1, 0, None])
    ax.axvline(x=avg_depth,
               color='green',
               label="avg. coverage: %.2fx" % (avg_depth,))
    ax.set_xlabel("coverage")
    ax.set_ylabel("cumulative fraction of alignment")


def plot_nccd(ax, depth_dist_cumperc, avg_depth, max_depth, avg_depth_scaled,
              lci, color):
    """
    Create a bar plot of a normalized cumulative coverage distribution.

    Includes a legend stating average scaled and total depth.
    """
    ax.set_title("NCCD")
    ax.bar(np.arange(depth_dist_cumperc.size) * 1.0 / max_depth,
           depth_dist_cumperc,
           align='edge',
           width=1.0 / (depth_dist_cumperc.size - 1),
           color=color)
    ax.axis([0, 1, 0, 1])
    ax.axvline(x=avg_depth_scaled,
               color='green',
               label="avg. coverage: %.2f [%.2fx]" % (avg_depth_scaled,
                                                      avg_depth))
    ax.axvline(x=lci * avg_depth_scaled,
               linestyle='dashed',
               color='green',
               label="%.2f x avg. cov.: %.2f [%.2fx]" % (
                   lci, lci * avg_depth_scaled, lci * avg_depth))
    ax.set_xlabel("fraction of max coverage")
    ax.set_ylabel("cumulative fraction of alignment")
    ax.legend(loc=4)  # lower right


def plot_lorenz(ax, x, y):
    """
    Create a Lorenz curve plot of a coverage distribution.
    """
    (gini, auc) = get_gini_auc(x, y)
    ax.set_title("Lorenz")
    # show equal distribution
    ax.plot([0, 1],
            [0, 1],
            color='green',
            label="equal distribution")
    # show actual distribution
    ax.plot(x, y, label="actual distribution")
    ax.fill(x, y, color='gray', alpha=0.2)
    ax.axis([0, 1, 0, 1])
    ax.set_xlabel("fraction of coverage bins")
    ax.set_ylabel("fraction of alignment")
    ax.text(0.05, 0.75, "Gini = %.2f" % (gini,), transform=ax.transAxes)
    ax.legend(loc=2)  # upper left


def plot_lorenz_b2b(ax, x, y):
    """
    Create a Lorenz curve plot of a base2base coverage distribution.
    """
    (gini, auc) = get_gini_auc(x, y)
    ax.set_title("Lorenz base2base")
    # show equal distribution
    ax.plot([0, 1],
            [0, 1],
            color='green',
            label="equal distribution")
    # show actual distribution
    ax.plot(x, y, label="actual distribution")
    ax.fill(x, y, color='gray', alpha=0.2)
    ax.axis([0, 1, 0, 1])
    ax.set_xlabel("fraction of reference bases")
    ax.set_ylabel("fraction of aligned read bases")
    ax.text(0.05, 0.75, "Gini = %.2f" % (gini,), transform=ax.transAxes)
    ax.legend(loc=2)  # upper left
""" Integration methods """
[docs]def integral_discrete(dist, limit):
"""
Integrate discrete distribution with stepsize 1 by simply adding up values.
"""
    return np.sum(dist[0:limit]) / 100.0


def integral_scaled(dist, limit):
    """
    Integrate scaled discrete distribution with arbitrary stepsize.
    """
    area = 0
    # prevent division by zero (for cases with 100% 1x coverage)
    if dist.size == 1:
        return 1.0 * limit
    # scale limit by max depth
    if limit <= dist.size - 1:
        limit_scaled = limit * 1.0 / (dist.size - 1)
    else:
        limit_scaled = 1.0
#    print("limit\t%f" % (limit_scaled,))
    stepsize = 1.0 / (dist.size - 1)
    x = stepsize
    for y in np.nditer(dist):
        if limit_scaled > x:
            area += stepsize * y
#            print("area for x = %f\t%f" % (x, area))
        else:
            area += (limit_scaled - (x - stepsize)) * y
#            print("area for x = %f\t%f" % (limit_scaled, area))
            break
        x += stepsize
    return area


def lorenzify(depth_dist):
    """
    Calculate Lorenz curve from a coverage distribution.
    """
    reads_sum = np.sum(depth_dist)
    dd_scaled = depth_dist * 1.0 / reads_sum
    dd_sorted = np.sort(dd_scaled)
    dd_cum = np.cumsum(dd_sorted)
    # append (0|0) origin to beginning of arrays
    y = np.append(np.array([0.0]), dd_cum)
    x = np.append(np.array([0.0]),
                  (np.arange(depth_dist.size) + 1) * 1.0 / depth_dist.size)
    return (x, y)


def lorenzify_b2b(depth_dist, ignore_uncovered=True):
    """
    Calculate Lorenz curve from a base2base coverage distribution.
    """
#    start = time.time()
    ali_total = 0
    ref_total = 0
#    base2base_list = []
    ali_bases_list = []
    ref_bases_list = []
    for (coverage, count) in enumerate(depth_dist):
        if ignore_uncovered:
            if coverage == 0:
                continue
        ali_bases = count * coverage
        ali_bases_list.append(ali_bases)
        ref_bases_list.append(count)
        ali_total += ali_bases
        ref_total += count
#        base2base_list.append((aligned_bases, count))

    # turn lists into arrays and scale them
    b = np.array(ref_bases_list) * 1.0 / ref_total
    v = np.array(ali_bases_list) * 1.0 / ali_total
    # sort (ref, ali) pairs by their ratio of aligned to reference bases
    triples = np.array([b, v, v / b])
    ttriples = np.transpose(triples)
#    triples.sort(axis=0)  # not working...
    tttriples = np.transpose(ttriples[ttriples[:, 2].argsort()])

#    sorted_pairs = sorted(base2base_list)
#    sorted_aligned_bases = []
#    sorted_ref_bases = []
#    for pair in sorted_pairs:
#        sorted_aligned_bases.append(pair[0])
#        sorted_ref_bases.append(pair[1])
#
#    x = np.cumsum(sorted_ref_bases)
#    y = np.cumsum(sorted_aligned_bases)

    x = np.cumsum(tttriples[0])
    y = np.cumsum(tttriples[1])
    # append (0|0) origin to beginning of arrays (both axes already scaled)
    x = np.append(np.array([0]), x)
    y = np.append(np.array([0]), y)
#    stop = time.time()
#    print("lorenzify_b2b took %f s for %i bp aligned to a %i bp reference"
#          % (stop - start, ali_total, ref_total))
    return (x, y)


def get_gini_auc(x, y):
    """
    Calculate Gini coefficient and area under Lorenz curve.

    Expects arrays of x and y coordinates for a normalized Lorenz
    distribution.
    """
    # calculate area under curve
    auc = 0.0
    for (idx, value) in enumerate(x):
        # no area at origin (0|0)
        if idx == 0:
            continue
        square = (x[idx] - x[idx - 1]) * y[idx - 1]
        triangle = ((x[idx] - x[idx - 1]) * (y[idx] - y[idx - 1])) / 2.0
        auc = auc + square + triangle
    # calculate Gini coefficient (area under normalized equal dist is 0.5)
#    gini = (0.5 - auc) / 0.5
    gini = 1.0 - (2 * auc)  # equivalent, but faster?
    return (gini, auc)


def covered_length_from_cigar(cigar):
    """
    Calculate length of reference covered by a read from CIGAR operations.

    - not counting padding, skipping or insertions into the reference
    - not counting hard or soft clipped bases of the read
    """
    regex = re.compile(r"(\d+)([MIDNSHP=X])")
    length = 0
    for match in regex.finditer(cigar):
        if match.group(2) in ("M", "=", "X", "D"):
            length += int(match.group(1))
    return length


def calc_avg_depth(depth_dist, ignore_uncovered=True):
    """
    Calculate average depth from a coverage distribution.

    Optionally ignores uncovered bases (first array element).
    """
    sum_bases = 0
    sum_cover = 0
    for (coverage, count) in enumerate(depth_dist):
        if ignore_uncovered:
            if coverage == 0:
                continue
        sum_bases += count
        sum_cover += coverage * count
    return sum_cover / (sum_bases * 1.0)


def main():
    # parse arguments
    parser = argparse.ArgumentParser(description=DESC)
#    parser.add_argument("sam",
#                        type=check_sam,
#                        help="SAM file to be analysed")
    parser.add_argument('-i', '--input',
                        type=check_sam,
                        help="specify SAM file to be analysed "
                             "(default: STDIN)",
                        required=False)
    parser.add_argument('-r', '--read_lengths',
                        required=False,
                        action='store_true',
                        help='analyze read length distributions')
    parser.add_argument('-c', '--covered',
                        required=False,
                        action='store_true',
                        help='covered bases only, ignore uncovered bases')
    parser.add_argument('-v', '--verbose',
                        required=False,
                        action='store_true',
                        help='print additional information to STDERR')
    parser.add_argument('-d', '--debug',
                        required=False,
                        action='store_true',
                        help='print debug messages to STDERR')
    parser.add_argument('--discard',
                        type=int,
                        help="discard (0) or keep (1) entries passing the "
                             "filter criteria",
                        required=False,
                        default=0)
    parser.add_argument('-p', '--plot',
                        required=False,
                        action='store_true',
                        help='plot distribution(s)')
    parser.add_argument('-l', '--lci_arg',
                        required=False,
                        type=check_pos_float_max1,
                        help='fraction of mean coverage to use for LCI '
                             'integration',
                        default=0.5)
    parser.add_argument('-s', '--stats_file',
                        required=False,
                        help="override filename for tab-separated statistics")
    acc_thresholds = parser.add_mutually_exclusive_group(required=False)
    acc_thresholds.add_argument('--min_cov',
                                type=check_pos_int,
                                help='minimum average coverage',
                                default=0)
    acc_thresholds.add_argument('--max_cov',
                                type=check_pos_int,
                                help='maximum average coverage',
                                default=sys.maxsize)
    acc_thresholds.add_argument('--min_lci',
                                type=check_pos_float,
                                help='minimum LCI',
                                default=0.0)
    acc_thresholds.add_argument('--max_lci',
                                type=check_pos_float,
                                help='maximum LCI',
                                default=1000.0)
    acc_thresholds.add_argument('--min_gini',
                                type=check_pos_float_max1,
                                help='minimum Gini coefficient',
                                default=0.0)
    acc_thresholds.add_argument('--max_gini',
                                type=check_pos_float_max1,
                                help='maximum Gini coefficient',
                                default=1.0)
    (args, remainArgs) = parser.parse_known_args()

    # configure logging
    if args.verbose:
        log.basicConfig(format="%(levelname)s: %(message)s", level=log.DEBUG)
    else:
        log.basicConfig(format="%(levelname)s: %(message)s")
#    if args.debug:
#        from pprint import pprint

    # set filename for statistics output
    if args.stats_file:
        stats_filename = args.stats_file
    elif args.input:
        stats_filename = args.input + ".stats.csv"
    else:
        # this assumes that the filename was explicitly set by the calling
        # batch script
        # TODO check effect when run in normal pipe without specifying
        # filename
        stats_filename = "${filename}.stats.csv"

    # initialize dicts
    depths = {}       # dict of dicts of ints (position -> depth)
    ref_lengths = {}  # dict of ints
    read_counts = {}  # dict of ints
    if args.read_lengths:
        read_lengths = {}        # dict of dicts of ints (length -> count)
        read_lengths_total = {}  # dict of ints
    line_count = 0
    positions = {}  # dict of lists of line numbers per reference
    buffer = None   # file handle used to buffer complete STDIN

    # parse SAM file and divide reads by reference accession
    log.info("START of filtering references by coverage.")
    log.info("STEP 1: parsing SAM records.")
    handle = None
    try:
        """
        open SAM file from either command line argument or STDIN
        use temporary file as buffer if reading from STDIN
        """
        if args.input:
            handle = open(args.input, 'r')
        else:
            handle = fileinput.input(remainArgs)
            buffer = tempfile.TemporaryFile(mode='w+')

        for line_nr, line in enumerate(handle, 0):
            line_count = line_nr + 1
            if buffer is not None:
                buffer.write(line)
            if not line.startswith("@"):
                fields = line.split("\t")
"""
# content description
===========================
0 QNAME
1 FLAG
2 RNAME
3 POS
4 MAPQ
5 CIGAR
6 RNEXT
7 PNEXT
8 TLEN
9 SEQ
10 QUAL
...optional fields...
11 AS alignment score
12 NM edit distance to reference
13 ZL reference length
14 ZR "raw score"
15 ZE "expected"
16 ZI percent identify (incl. indels and deaminated)
17 MD mismatching positions
18 NS PMD score
"""
                # extract accession from required RNAME field
                # (as written by MALT 0.0.3)
                rname_parts = fields[2].split("|")
                """
                #   content
                ===============
                0   gi
                1   GenBank ID
                2   ref
                3   accession
                4   tax
                5   taxonomy ID
                """
accession = ""
try:
accession = rname_parts[3]
except IndexError:
accession = "unspecified accession"
log.warn("no accession in line %s" % (line_nr + 1),
"run MALT version 0.0.3 or higher")

                # extract reference sequence length from optional ZL field
                ref_length = 0
                try:
                    zl_parts = fields[13].split(":")
                    ref_length = int(zl_parts[2])
                except IndexError:
                    log.error("no ZL field in line %i" % (line_nr + 1,))
                    exit(1)
"""
read ------------------------------
ref -----------------------------------------------------
cover 00000000000011111111111111111111111111111100000000000
pre | covered | post
start stop
"""
                # get 1-based start position of read from required POS field
                pre = int(fields[3]) - 1
                # obtain stop position from required CIGAR field
                covered = covered_length_from_cigar(fields[5])
                stop = pre + covered
#                post = ref_length - covered - pre

#                # usage of Numpy arrays too slow and memory-consuming :-/
#                log.info("%s\t%i\t%i\t%i" % (accession, pre, covered, post))
#
#                coverage = np.concatenate((np.zeros((pre,), dtype=int),
#                                           np.ones((covered,), dtype=int),
#                                           np.zeros((post,), dtype=int)),
#                                          axis=0)
#
#                # add read to overall coverage of current accession
#                if accession in depths:
#                    depths[accession] = np.add(depths[accession], coverage)
#                else:
#                    depths[accession] = coverage

                # the read covers the 0-based positions pre..stop-1
                if accession in depths:
                    read_counts[accession] += 1
                    for position in range(pre, stop):
                        if position in depths[accession]:
                            depths[accession][position] += 1
                        else:
                            depths[accession][position] = 1
                else:
                    ref_lengths[accession] = ref_length
                    read_counts[accession] = 1
                    depths[accession] = {position: 1
                                         for position in range(pre, stop)}

                # optional: read length distribution
                if args.read_lengths:
                    read_len = len(fields[9])
                    if read_len in read_lengths_total:
                        read_lengths_total[read_len] += 1
                    else:
                        read_lengths_total[read_len] = 1

                    if accession in read_lengths:
                        if read_len in read_lengths[accession]:
                            read_lengths[accession][read_len] += 1
                        else:
                            read_lengths[accession][read_len] = 1
                    else:
                        read_lengths[accession] = {read_len: 1}

                # remember occurrence of accession and line number(s)
                try:
                    positions[accession].append(line_nr)
                except KeyError:
                    positions[accession] = [line_nr, ]
    finally:
        if handle is not None:
            handle.close()
        if buffer is not None:
            buffer.seek(0)
log.info("%i reads are aligned to %i references"
% (line_count, len(depths)))
# get overall read length statistics
if args.read_lengths:
max_length_total = max(read_lenghtsTotal.keys())
min_length_total = min(read_lenghtsTotal.keys())
length_dist_total = np.zeros(
(max_length_total - min_length_total + 1, ), dtype=np.int)
for (length, count) in read_lenghtsTotal.items():
length_dist_total[length - min_length_total] = count
log.info("read lengths range from %i to %i bp"
% (min_length_total, max_length_total))

    # iterate over accessions and calculate statistics for filtering
    log.info("STEP 2: determining accessions matching the filter criteria.")
    lines = []
    max_depths = []
    max_lengths = []
    with open(stats_filename, "w") as handle:
        # create statistics header
        header = "accession\treference length [bp]\treads"
        if args.read_lengths:
            header += "\tmin length [bp]\tmax length [bp]\tmean length [bp]"
        header += "\tmin coverage\tmax coverage\tmean coverage"
        if args.covered:
            header += "\tmin coverage*\tmax coverage*\tmean coverage*"
        header += "\tLCI(%.2f)\tGini" % (args.lci_arg,)
        if args.covered:
            header += "\tLCI(%.2f)*\tGini*" % (args.lci_arg,)
        print(header, file=handle)

        for accession in depths:
            ref_length = ref_lengths[accession]
            read_count = read_counts[accession]

            # calculate coverage statistics
            max_depth = max(depths[accession].values())
            max_depths.append(max_depth)

            # determine depth distribution
            uncovered_bases = ref_length
            depth_dist = np.zeros((max_depth + 1,), dtype=int)
            for (pos, depth) in depths[accession].items():
                depth_dist[depth] += 1
                uncovered_bases -= 1
            depth_dist[0] = uncovered_bases
            if args.debug:
                print(depth_dist)

            min_depth = 0
            if uncovered_bases == 0:
                min_depth = min(depths[accession].values())
            avg_depth = calc_avg_depth(depth_dist, False)
            avg_depth_scaled = avg_depth * 1.0 / max_depth

            # convert counts to fractions of reference length (normalize y)
            depth_dist_cumsum = np.cumsum(depth_dist)
            depth_dist_cumperc = depth_dist_cumsum * 1.0 / ref_length

            # integrate distribution as LCI(args.lci_arg)
            limit = args.lci_arg * avg_depth
#            lci = integral_discrete(depth_dist_cumperc, limit)
            lci = integral_scaled(depth_dist_cumperc, limit)

#            # calculate Gini coefficient from Lorenz curve
#            (x, y) = lorenzify(depth_dist)
#            (gini, auc) = get_gini_auc(x, y)

            # alternative Lorenz curve using base to base assignments
            (u, v) = lorenzify_b2b(depth_dist, False)
            if args.debug:
                print("x (%i entries): %s" % (len(u), u))
                print("y (%i entries): %s" % (len(v), v))
            (gini_b2b, auc_b2b) = get_gini_auc(u, v)

            # TODO calculate Theil index, see
            # https://en.wikipedia.org/wiki/Theil_Index

            # optional: coverage stats for covered bases only
            if args.covered:
                min_depth_covered = min(depths[accession].values())
                if uncovered_bases == 0:
                    min_depth_covered = min_depth
                avg_depth_covered = calc_avg_depth(depth_dist, True)
#                avg_depth_covered_scaled = avg_depth_covered * 1.0 / max_depth
                depth_dist_covered_cumsum = np.cumsum(depth_dist[1:])
                depth_dist_covered_cumperc = (depth_dist_covered_cumsum * 1.0
                                              / (ref_length - uncovered_bases))
                limit_covered = args.lci_arg * avg_depth_covered
                lci_covered = integral_scaled(depth_dist_covered_cumperc,
                                              limit_covered)
                (m, n) = lorenzify_b2b(depth_dist, True)
                if args.debug:
                    print("x (%i entries): %s" % (len(m), m))
                    print("y (%i entries): %s" % (len(n), n))
                (gini_b2b_covered, auc_b2b_covered) = get_gini_auc(m, n)

            # optional: read length stats
            if args.read_lengths:
                max_length = max(read_lengths[accession].keys())
                min_length = min(read_lengths[accession].keys())
                # weight each length by its count to get the mean read length
                avg_length = (sum(length * count
                                  for (length, count)
                                  in read_lengths[accession].items())
                              * 1.0 / read_count)
                max_lengths.append(max_length)

                # determine length distribution
                length_dist = np.zeros((max_length + 1,), dtype=int)
                for (length, count) in read_lengths[accession].items():
                    length_dist[length] = count

            # compile accession statistics
            stats = "%s\t%i\t%i" % (accession, ref_length, read_count)
            if args.read_lengths:
                stats += "\t%i\t%i\t%f" % (min_length, max_length, avg_length)
            stats += "\t%i\t%i\t%f" % (min_depth, max_depth, avg_depth)
            if args.covered:
                stats += "\t%i\t%i\t%f" % (min_depth_covered, max_depth,
                                           avg_depth_covered)
            stats += "\t%f\t%f" % (lci, gini_b2b)
            if args.covered:
                stats += "\t%f\t%f" % (lci_covered, gini_b2b_covered)
            print(stats, file=handle)

            # apply filters before time-consuming plotting steps
            if ((avg_depth < args.min_cov)
                    or (avg_depth > args.max_cov)
                    or (lci < args.min_lci)
                    or (lci > args.max_lci)
                    or (gini_b2b < args.min_gini)
                    or (gini_b2b > args.max_gini)):
                continue

            color = 'blue'
            if args.plot:
                fig_cover = plt.figure(num=1,
                                       figsize=(24, 12),  # inch x inch
                                       dpi=80,  # 1 inch = 80 pixels
                                       facecolor='white',
                                       edgecolor='black')
#                x = np.arange(depth_dist_cumperc.size) * 1.0 / max_depth

                # CD = coverage distribution plot
                ax_cd = fig_cover.add_subplot(2, 4, 1)
                plot_cd(ax_cd, depth_dist, avg_depth)

                # SCD = scaled coverage distribution plot
                ax_scd = fig_cover.add_subplot(2, 4, 2)
                plot_scd(ax_scd, depth_dist, avg_depth, ref_length, max_depth)

                # SCD for covered bases only (requires --covered stats)
                if args.covered:
                    ax_scd_covered = fig_cover.add_subplot(2, 4, 3)
                    plot_scd(ax_scd_covered,
                             np.append(np.array([0]), depth_dist[1:]),
                             avg_depth_covered,
                             ref_length - uncovered_bases,
                             max_depth)

#                # CCD = cumulative coverage distribution plot
#                ax_ccd = fig_cover.add_subplot(2, 4, 2)
#                plot_ccd(ax_ccd, depth_dist_cumsum, avg_depth)

#                # SCCD = scaled cumulative coverage distribution plot
#                ax_sccd = fig_cover.add_subplot(2, 4, 3)
#                plot_sccd(ax_sccd, depth_dist_cumperc, avg_depth)

                # NCCD = normalized cumulative coverage distribution
                ax_nccd = fig_cover.add_subplot(2, 4, 4)
                plot_nccd(ax_nccd, depth_dist_cumperc, avg_depth, max_depth,
                          avg_depth_scaled, args.lci_arg, color)
                ax_nccd.text(0.05, 0.95,
                             "lci(%.2f) = %.2f" % (args.lci_arg, lci),
                             transform=ax_nccd.transAxes)

#                # Lorenz curve
#                ax_lorenz = fig_cover.add_subplot(2, 4, 5)
#                plot_lorenz(ax_lorenz, x, y)

                # Lorenz curve base2base
                ax_b2b = fig_cover.add_subplot(2, 4, 6)
                plot_lorenz_b2b(ax_b2b, u, v)

                if args.covered:
                    ax_b2b_covered = fig_cover.add_subplot(2, 4, 7)
                    plot_lorenz_b2b(ax_b2b_covered, m, n)

                # save plot to PNG
                fig_cover.tight_layout()
                fig_cover.savefig(accession + "_coverage.png")
                plt.close(fig_cover)

                if args.read_lengths:
                    fig_readlength = plt.figure(num=2,
                                                figsize=(6, 6),  # inch x inch
                                                dpi=80,  # 1 inch = 80 px
                                                facecolor='white',
                                                edgecolor='black')
                    # RLD = read length distribution
                    ax_rld = fig_readlength.add_subplot(1, 1, 1)
                    plot_rld(ax_rld, length_dist, min_length_total,
                             max_length_total, line_nr + 1, length_dist_total)
                    # save plot to PNG
                    fig_readlength.savefig(accession + "_readlengths.png")
                    plt.close(fig_readlength)
log.info("accession %s: reference has a length of %i bp and %i "
"aligned reads" % (accession, ref_length, read_count))
log.info("accession %s: coverage ranges from %ix to %ix, average "
"is %.3fx" % (accession, min_depth, max_depth, avg_depth))
if args.covered:
log.info("accession %s: coverage for covered bases only "
"ranges from %ix to %ix, average is %.3fx"
% (accession, min_depth_covered, max_depth,
avg_depth_covered))
if args.read_lengths:
log.info("accession %s: read length ranges from %i bp to %i "
"bp, average is %.3f bp" % (accession, min_length,
max_length, avg_length))
# finally print accession for later filtering steps
lines.extend(positions[accession])
if args.debug:
log.info("filtering accession: %s" % (accession,))

    # keep or remove accessions from SAM file
    alt = ["discarding", "keeping"]
    log.info("STEP 3: %s %i lines of unevenly covered references from SAM "
             "file." % (alt[args.discard], len(lines)))
    if buffer is None:
        buffer = open(args.input, 'r')
    line_filter(lines, buffer, discard=(args.discard == 0))
    buffer.close()

    log.info("END of filtering references by coverage, statistics saved to "
             "file %s" % (stats_filename,))
    exit()


if __name__ == "__main__":
    main()