"""
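Codon usage analysis helpers.

Builds codon and amino-acid usage histograms for genes read from FASTA files,
derives relative codon frequencies (RF) and relative synonymous codon usage
(RSCU) from them, and embeds the per-gene profiles with multidimensional
scaling (MDS).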
"""
# NOTE(review): 21 equals len(amino_acids_unique) defined below (20 amino acids
# plus the stop symbol '*'), not the 64 entries of `codons`; the name looks
# misleading -- confirm the intended meaning before relying on it.
number_of_codons = 21
import doctest
from Bio import SeqIO
import sys
import argparse
import json
import time
import itertools
import numpy as np
import scipy as sp
import scipy.linalg
import scipy.sparse
import scipy.sparse.linalg
from Bio import SeqIO
import pylab as p
import pprint
from sklearn import manifold
# Codon table: the 64 codons are enumerated with the bases in t, c, a, g order,
# which is the order the one-letter amino-acid string below is written in.
bases = ['t', 'c', 'a', 'g']
codons = [a + b + c for a in bases for b in bases for c in bases]
amino_acids = 'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG'
codon_table = dict(zip(codons, amino_acids))

# Position of every amino acid (and the stop symbol '*') in the 21-bin
# amino-acid histogram.
amino_acids_unique = 'FLSY*CWPHQRIMTNKVADEG'
amino_table_index = dict(zip(amino_acids_unique, range(len(amino_acids_unique))))

# Position of every codon in the 64-bin codon histogram.  `index` is
# materialised as a list so it is the same kind of object on Python 2 and 3.
index = list(range(0, 64))
codon_hist_index = dict(zip(codons, index))

# Left empty exactly as in the original (dict(zip()) evaluates to {}).
aa_hist_index = {}

# Reverse codon table: amino acid -> list of its synonymous codons.  Walking
# `codons` (instead of the Python-2-only dict.iteritems()) keeps every list in
# codon order, independent of dictionary hash ordering.
rev_codon_table = {}
for codon in codons:
    rev_codon_table.setdefault(codon_table[codon], []).append(codon)
def load_fasta(filename):
    """
    Open a FASTA file and return Biopython's record iterator for it.

    The file is only opened here; ``SeqIO.parse`` reads it lazily, so the
    handle has to stay open while the caller consumes the records.

    Parameters
    ----------
    filename : path of the FASTA file to read.

    Returns
    -------
    The object returned by ``SeqIO.parse(handle, 'fasta')``, or ``None`` when
    the file cannot be opened or the parser cannot be created (a message and
    the error are printed in that case).

    Test
    ----
    The file must exist

    >>> load_fasta("yikes")
    Open: yikes  failed
    [Errno 2] No such file or directory: 'yikes'

    However, we use Biopython and the fasta file's syntax is not checked here!

    >>> load_fasta("testdata_fail.ffn") # doctest: +ELLIPSIS
    <generator object parse at ...>

    If everything is right, a generator object is returned

    >>> load_fasta("testdata.ffn") # doctest: +ELLIPSIS
    <generator object parse at ...>
    """
    try:
        handle = open(filename)
    except (IOError, OSError, TypeError, ValueError) as error:
        print("Open: {0}  failed".format(filename))
        print(error)
        return None

    try:
        # Not wrapped in `with`: parsing is lazy and needs the open handle.
        return SeqIO.parse(handle, 'fasta')
    except Exception as error:
        # The parser could not be created, so nobody will consume the handle.
        handle.close()
        print("import: {0}  failed".format(filename))
        print(error)
        return None
def load_genbank(filename):
    """Placeholder for loading genes from a GenBank file.

    Not implemented yet: ``filename`` is ignored and ``None`` is returned
    (the intended result is a list of genes).
    """
    return None
def load_plaintext(filename):
    """Placeholder for loading genes from a plain-text nucleotide file.

    Not implemented yet: ``filename`` is ignored and ``None`` is returned
    (the intended result is a list of genes).
    """
    return None
def make_codon_histogram(gene):
    """Return the normalised codon and amino-acid histograms of one gene.

    The sequence is read in frame from its first base.  Triplets that are not
    one of the 64 codons in ``codons`` (an incomplete trailing triplet or
    ambiguous bases) are skipped for both histograms.

    Parameters
    ----------
    gene : record with a ``seq`` attribute whose ``str()`` is the nucleotide
        sequence (e.g. a Biopython ``SeqRecord``); case is ignored.

    Returns
    -------
    (codon_hist, aa_hist) : two lists of floats with 64 and 21 entries, in the
        order of ``codons`` and ``amino_acids_unique``.  Each list sums to 1;
        a gene without any valid codon yields all zeros.

    Test
    ----
    The histogram should reproduce the first field (frequency per thousand,
    raw count in parentheses) of what
    www.kazusa.org.jp/codon/cgi-bin/countcodon.cgi computes for the gene in
    testdata.ffn, which has 712 codons:

    UUU 12.6( 9) UCU 9.8( 7) UAU 15.4( 11) UGU 4.2( 3)
    UUC 21.1( 15) UCC 16.9( 12) UAC 11.2( 8) UGC 12.6( 9)
    UUA 15.4( 11) UCA 8.4( 6) UAA 0.0( 0) UGA 1.4( 1)
    UUG 12.6( 9) UCG 12.6( 9) UAG 0.0( 0) UGG 5.6( 4)
    CUU 8.4( 6) CCU 4.2( 3) CAU 9.8( 7) CGU 22.5( 16)
    CUC 15.4( 11) CCC 5.6( 4) CAC 7.0( 5) CGC 25.3( 18)
    CUA 4.2( 3) CCA 1.4( 1) CAA 11.2( 8) CGA 2.8( 2)
    CUG 53.4( 38) CCG 23.9( 17) CAG 19.7( 14) CGG 5.6( 4)
    AUU 36.5( 26) ACU 5.6( 4) AAU 28.1( 20) AGU 7.0( 5)
    AUC 16.9( 12) ACC 23.9( 17) AAC 19.7( 14) AGC 8.4( 6)
    AUA 0.0( 0) ACA 0.0( 0) AAA 26.7( 19) AGA 0.0( 0)
    AUG 29.5( 21) ACG 9.8( 7) AAG 14.0( 10) AGG 1.4( 1)
    GUU 25.3( 18) GCU 25.3( 18) GAU 37.9( 27) GGU 28.1( 20)
    GUC 19.7( 14) GCC 32.3( 23) GAC 18.3( 13) GGC 28.1( 20)
    GUA 8.4( 6) GCA 12.6( 9) GAA 47.8( 34) GGA 12.6( 9)
    GUG 32.3( 23) GCG 35.1( 25) GAG 18.3( 13) GGG 9.8( 7)

    Remember, the order of codons in this package is given in the variable
    codons.  Scaling the frequencies back by 712 recovers the counts:

    >>> codon_hist, aa_hist = make_codon_histogram(next(load_fasta("testdata.ffn")))
    >>> [int(round(v * 712)) for v in codon_hist]  # doctest: +NORMALIZE_WHITESPACE
    [9, 15, 11, 9, 7, 12, 6, 9, 11, 8, 0, 0, 3, 9, 1, 4,
     6, 11, 3, 38, 3, 4, 1, 17, 7, 5, 8, 14, 16, 18, 2, 4,
     26, 12, 0, 21, 4, 17, 0, 7, 20, 14, 19, 10, 5, 6, 0, 1,
     18, 14, 6, 23, 18, 23, 9, 25, 27, 13, 34, 13, 20, 20, 9, 7]
    >>> [int(round(v * 712)) for v in aa_hist]  # doctest: +NORMALIZE_WHITESPACE
    [24, 78, 45, 19, 1, 12, 4, 25, 12, 22, 41, 38, 21, 28, 34, 29,
     61, 75, 40, 47, 56]
    """
    codon_counts = np.zeros(64, dtype=int)
    aa_counts = np.zeros(21, dtype=int)

    # str() replaces the deprecated Seq.tostring().
    sequence = str(gene.seq).lower()
    for start in range(0, len(sequence), 3):
        codon = sequence[start:start + 3]
        slot = codon_hist_index.get(codon)
        if slot is None:
            # Unknown triplet: skip it entirely.  Indexing the amino-acid
            # array with None would broadcast and increment *every* bin.
            continue
        codon_counts[slot] += 1
        aa_counts[amino_table_index[codon_table[codon]]] += 1

    # Totals are computed once; `or 1.0` keeps an empty histogram all-zero
    # instead of turning it into NaN through a division by zero.
    codon_total = float(codon_counts.sum()) or 1.0
    aa_total = float(aa_counts.sum()) or 1.0
    codon_hist = (codon_counts / codon_total).tolist()
    aa_hist = (aa_counts / aa_total).tolist()
    return (codon_hist, aa_hist)
def make_codon_histogram_dic(f):
    """Return normalised codon and amino-acid histograms for every gene in ``f``.

    Each sequence is read in frame from its first base.  Triplets that are not
    one of the 64 codons in ``codons`` are skipped for both histograms.

    Parameters
    ----------
    f : iterable of records with ``id`` and ``seq`` attributes, e.g. the
        result of ``load_fasta``.  Records sharing an ``id`` overwrite each
        other; the last one wins.

    Returns
    -------
    (codon_hist, aa_hist) : two dictionaries keyed by ``gene.id``.  The values
        are lists of floats with 64 and 21 entries, in the order of ``codons``
        and ``amino_acids_unique``.  Each list sums to 1; a gene without any
        valid codon yields all zeros.

    Test
    ----
    The histogram should reproduce the first field (frequency per thousand,
    raw count in parentheses) of what
    www.kazusa.org.jp/codon/cgi-bin/countcodon.cgi computes for the first gene
    in testdata2.ffn, which has 712 codons:

    UUU 12.6( 9) UCU 9.8( 7) UAU 15.4( 11) UGU 4.2( 3)
    UUC 21.1( 15) UCC 16.9( 12) UAC 11.2( 8) UGC 12.6( 9)
    UUA 15.4( 11) UCA 8.4( 6) UAA 0.0( 0) UGA 1.4( 1)
    UUG 12.6( 9) UCG 12.6( 9) UAG 0.0( 0) UGG 5.6( 4)
    CUU 8.4( 6) CCU 4.2( 3) CAU 9.8( 7) CGU 22.5( 16)
    CUC 15.4( 11) CCC 5.6( 4) CAC 7.0( 5) CGC 25.3( 18)
    CUA 4.2( 3) CCA 1.4( 1) CAA 11.2( 8) CGA 2.8( 2)
    CUG 53.4( 38) CCG 23.9( 17) CAG 19.7( 14) CGG 5.6( 4)
    AUU 36.5( 26) ACU 5.6( 4) AAU 28.1( 20) AGU 7.0( 5)
    AUC 16.9( 12) ACC 23.9( 17) AAC 19.7( 14) AGC 8.4( 6)
    AUA 0.0( 0) ACA 0.0( 0) AAA 26.7( 19) AGA 0.0( 0)
    AUG 29.5( 21) ACG 9.8( 7) AAG 14.0( 10) AGG 1.4( 1)
    GUU 25.3( 18) GCU 25.3( 18) GAU 37.9( 27) GGU 28.1( 20)
    GUC 19.7( 14) GCC 32.3( 23) GAC 18.3( 13) GGC 28.1( 20)
    GUA 8.4( 6) GCA 12.6( 9) GAA 47.8( 34) GGA 12.6( 9)
    GUG 32.3( 23) GCG 35.1( 25) GAG 18.3( 13) GGG 9.8( 7)

    Remember, the order of codons in this package is given in the variable
    codons.  Both genes in testdata2.ffn carry the same sequence, so scaling
    either histogram back by 712 recovers the counts above:

    >>> codon_hist, aa_hist = make_codon_histogram_dic(load_fasta("testdata2.ffn"))
    >>> sorted(codon_hist)  # doctest: +NORMALIZE_WHITESPACE
    ['fid|129049020348348942|locus|VBIEscCol44059_0001|',
     'fid|18348942|locus|VBIEscCol44059_0001|']
    >>> expected = [9, 15, 11, 9, 7, 12, 6, 9, 11, 8, 0, 0, 3, 9, 1, 4,
    ...             6, 11, 3, 38, 3, 4, 1, 17, 7, 5, 8, 14, 16, 18, 2, 4,
    ...             26, 12, 0, 21, 4, 17, 0, 7, 20, 14, 19, 10, 5, 6, 0, 1,
    ...             18, 14, 6, 23, 18, 23, 9, 25, 27, 13, 34, 13, 20, 20, 9, 7]
    >>> all([int(round(v * 712)) for v in codon_hist[gene]] == expected
    ...     for gene in codon_hist)
    True
    """
    codon_hist = {}
    aa_hist = {}
    for gene in f:
        codon_counts = np.zeros(64, dtype=int)
        aa_counts = np.zeros(21, dtype=int)

        # str() replaces the deprecated Seq.tostring().
        sequence = str(gene.seq).lower()
        for start in range(0, len(sequence), 3):
            codon = sequence[start:start + 3]
            slot = codon_hist_index.get(codon)
            if slot is None:
                # Unknown triplet: skip it entirely.  Indexing the amino-acid
                # array with None would broadcast and increment *every* bin.
                continue
            codon_counts[slot] += 1
            aa_counts[amino_table_index[codon_table[codon]]] += 1

        # Totals are computed once per gene; `or 1.0` keeps an empty
        # histogram all-zero instead of dividing by zero.
        codon_total = float(codon_counts.sum()) or 1.0
        aa_total = float(aa_counts.sum()) or 1.0
        codon_hist[gene.id] = (codon_counts / codon_total).tolist()
        aa_hist[gene.id] = (aa_counts / aa_total).tolist()
    return (codon_hist, aa_hist)
def calculate_RF_dic(codon_hist):
    """Calculate the relative codon frequency (RF) for every gene.

    For each amino acid, the frequency of every synonymous codon is divided by
    the summed frequency of all codons of that amino acid, so the values of
    one synonymous family add up to 1.  Families that do not occur in a gene
    keep the value 0.

    Parameters
    ----------
    codon_hist : dictionary mapping a gene id to its 64-entry codon histogram
        (first element of the result of ``make_codon_histogram_dic``).

    Returns
    -------
    Dictionary mapping each gene id to a float numpy array with 64 entries in
    the order of ``codons``.

    Test
    ----
    Test against run of testdata2.ffn; hand calculated for amino acid A
    (codons 52 - 55) this should give 0.24, 0.3066, 0.12, 0.33.  Both genes in
    that file yield the same values.

    >>> rf = calculate_RF_dic(make_codon_histogram_dic(load_fasta("testdata2.ffn"))[0])
    >>> sorted(rf)  # doctest: +NORMALIZE_WHITESPACE
    ['fid|129049020348348942|locus|VBIEscCol44059_0001|',
     'fid|18348942|locus|VBIEscCol44059_0001|']
    >>> first, second = [rf[gene] for gene in sorted(rf)]
    >>> bool(np.allclose(first, second))
    True
    >>> np.round(second, 8).tolist()  # doctest: +NORMALIZE_WHITESPACE
    [0.375, 0.625, 0.14102564, 0.11538462, 0.15555556,
     0.26666667, 0.13333333, 0.2, 0.57894737, 0.42105263,
     0.0, 0.0, 0.25, 0.75, 1.0,
     1.0, 0.07692308, 0.14102564, 0.03846154, 0.48717949,
     0.12, 0.16, 0.04, 0.68, 0.58333333,
     0.41666667, 0.36363636, 0.63636364, 0.3902439, 0.43902439,
     0.04878049, 0.09756098, 0.68421053, 0.31578947, 0.0,
     1.0, 0.14285714, 0.60714286, 0.0, 0.25,
     0.58823529, 0.41176471, 0.65517241, 0.34482759, 0.11111111,
     0.13333333, 0.0, 0.02439024, 0.29508197, 0.2295082,
     0.09836066, 0.37704918, 0.24, 0.30666667, 0.12,
     0.33333333, 0.675, 0.325, 0.72340426, 0.27659574,
     0.35714286, 0.35714286, 0.16071429, 0.125]
    """
    hist_rf = {}
    for gene in codon_hist:
        frequencies = codon_hist[gene]
        relative = np.zeros(64)
        for synonymous in rev_codon_table.values():
            slots = [codon_hist_index[codon] for codon in synonymous]
            # Start from 0.0 so the later division is a true division even
            # when the histogram holds integer counts.
            family_total = sum((frequencies[slot] for slot in slots), 0.0)
            if family_total > 0:
                for slot in slots:
                    relative[slot] = frequencies[slot] / family_total
        hist_rf[gene] = relative
    return hist_rf
def calculate_RF(codon_hist):
    """Placeholder for the relative codon frequency of a single histogram.

    Not implemented yet: ``codon_hist`` is ignored, a notice is printed and
    ``None`` is returned.
    """
    print('not implemented yet')
def calculate_RSCU_dic(codon_hist):
    """Calculate the relative synonymous codon usage (RSCU) for every gene.

    For each amino acid, the frequency of every synonymous codon is divided by
    the summed frequency of all codons of that amino acid and multiplied by
    the number of synonymous codons, so equal usage within a family gives 1
    for each of its codons.  Families that do not occur in a gene keep 0.

    Parameters
    ----------
    codon_hist : dictionary mapping a gene id to its 64-entry codon histogram
        (first element of the result of ``make_codon_histogram_dic``).

    Returns
    -------
    Dictionary mapping each gene id to a float numpy array with 64 entries in
    the order of ``codons``.

    Test
    ----
    Tested like calculate_RF_dic and checked against genomes.urv.es/optimizer.
    Both genes in testdata2.ffn yield the same values.

    >>> rscu = calculate_RSCU_dic(make_codon_histogram_dic(load_fasta("testdata2.ffn"))[0])
    >>> sorted(rscu)  # doctest: +NORMALIZE_WHITESPACE
    ['fid|129049020348348942|locus|VBIEscCol44059_0001|',
     'fid|18348942|locus|VBIEscCol44059_0001|']
    >>> first, second = [rscu[gene] for gene in sorted(rscu)]
    >>> bool(np.allclose(first, second))
    True
    >>> np.round(second, 8).tolist()  # doctest: +NORMALIZE_WHITESPACE
    [0.75, 1.25, 0.84615385, 0.69230769, 0.93333333,
     1.6, 0.8, 1.2, 1.15789474, 0.84210526,
     0.0, 0.0, 0.5, 1.5, 3.0,
     1.0, 0.46153846, 0.84615385, 0.23076923, 2.92307692,
     0.48, 0.64, 0.16, 2.72, 1.16666667,
     0.83333333, 0.72727273, 1.27272727, 2.34146341, 2.63414634,
     0.29268293, 0.58536585, 2.05263158, 0.94736842, 0.0,
     1.0, 0.57142857, 2.42857143, 0.0, 1.0,
     1.17647059, 0.82352941, 1.31034483, 0.68965517, 0.66666667,
     0.8, 0.0, 0.14634146, 1.18032787, 0.91803279,
     0.39344262, 1.50819672, 0.96, 1.22666667, 0.48,
     1.33333333, 1.35, 0.65, 1.44680851, 0.55319149,
     1.42857143, 1.42857143, 0.64285714, 0.5]
    """
    hist_rscu = {}
    for gene in codon_hist:
        frequencies = codon_hist[gene]
        usage = np.zeros(64)
        for synonymous in rev_codon_table.values():
            slots = [codon_hist_index[codon] for codon in synonymous]
            degeneracy = len(synonymous)
            # Start from 0.0 so the later division is a true division even
            # when the histogram holds integer counts.
            family_total = sum((frequencies[slot] for slot in slots), 0.0)
            if family_total > 0:
                for slot in slots:
                    usage[slot] = degeneracy * frequencies[slot] / family_total
        hist_rscu[gene] = usage
    return hist_rscu
def calculate_RSCU(codon_hist):
    """Calculate the relative synonymous codon usage (RSCU) of one gene.

    For each amino acid, the frequency of every synonymous codon is divided by
    the summed frequency of all codons of that amino acid and multiplied by
    the number of synonymous codons.  Families that do not occur keep 0.

    Parameters
    ----------
    codon_hist : sequence with 64 codon frequencies (or counts) in the order
        of ``codons``.

    Returns
    -------
    Float numpy array with 64 entries in the order of ``codons``.
    """
    hist_rscu = np.zeros(64)
    for synonymous in rev_codon_table.values():
        slots = [codon_hist_index[codon] for codon in synonymous]
        degeneracy = len(synonymous)
        # Start from 0.0 so the later division is a true division even when
        # the histogram holds integer counts.
        family_total = sum((codon_hist[slot] for slot in slots), 0.0)
        if family_total > 0:
            for slot in slots:
                hist_rscu[slot] = degeneracy * codon_hist[slot] / family_total
    return hist_rscu
def calculate_CAI():
    """Placeholder; not implemented yet, always returns ``None``."""
    return None
def calculate_NC():
    """Placeholder; not implemented yet, always returns ``None``."""
    return None
def highly_expressed_genes_by_file(filename, hist):
    """Flag the genes of ``hist`` that are listed in a highly-expressed-genes file.

    The file format is the one used by ecai/heg: the first three lines are
    skipped and the sixth whitespace-separated field of every following line
    is taken as a gene identifier.  Lines with fewer than six fields are
    ignored.  A gene counts as highly expressed when any of these identifiers
    occurs as a substring of its key in ``hist``.

    Parameters
    ----------
    filename : path of the file to read.  An empty value falls back to
        'ecoli3.heg.txt', the file that was previously always read.
    hist : histogram dictionary (or any sized iterable of gene keys).

    Returns
    -------
    List of booleans, one per key of ``hist`` in iteration order; ``True``
    marks a highly expressed gene.
    """
    source = filename or 'ecoli3.heg.txt'
    with open(source) as handle:
        rows = [line.split() for line in handle]
    # Require all six fields: indexing shorter rows would raise IndexError.
    heg = [fields[5] for fields in rows[3:] if len(fields) > 5]
    return [any(test_heg in gene for test_heg in heg) for gene in hist]
def highly_expressed_genes_by_id(id_list, hist):
    """Flag the genes of ``hist`` whose key contains one of the given ids.

    Parameters
    ----------
    id_list : iterable of identifier strings of highly expressed genes.
    hist : histogram dictionary (or any sized iterable of gene keys).

    Returns
    -------
    List of booleans, one per key of ``hist`` in iteration order; ``True``
    when any identifier occurs as a substring of the key.
    """
    # Materialise once so a one-shot iterable is not exhausted by the first gene.
    heg = list(id_list)
    return [any(test_heg in gene for test_heg in heg) for gene in hist]
def mds(hist, n_components=2, max_iter=300, n_init=8, n_jobs=-1):
    """Embed the per-gene profiles of ``hist`` with multidimensional scaling.

    Parameters
    ----------
    hist : dictionary mapping a gene id to its numeric profile; all profiles
        must have the same length.
    n_components, max_iter, n_init, n_jobs : forwarded to
        ``sklearn.manifold.MDS``.

    Returns
    -------
    The result of ``MDS.fit_transform``: one row per gene, in the iteration
    order of ``hist.values()``.
    """
    # list() is required on Python 3, where dict.values() is a view.
    profiles = np.array(list(hist.values()))
    embedding = manifold.MDS(
        n_components=n_components,
        max_iter=max_iter,
        n_init=n_init,
        n_jobs=n_jobs,
    )
    return embedding.fit_transform(profiles)
def plot_mds(Y, n_components=2):
    """Placeholder for plotting an MDS embedding.

    Not implemented yet: both arguments are ignored and ``None`` is returned.
    """
    return None
def mds_from_fasta(filename):
    """Run the whole pipeline for a FASTA file: histograms, RSCU, MDS.

    Parameters
    ----------
    filename : path of the FASTA file to analyse.

    Returns
    -------
    The embedding returned by ``mds`` for the per-gene RSCU profiles, or
    ``None`` when ``load_fasta`` could not open the file.
    """
    fasta = load_fasta(filename)
    if fasta is None:
        # load_fasta has already reported the problem.
        return None
    codon_dic, _aa_dic = make_codon_histogram_dic(fasta)
    rscu = calculate_RSCU_dic(codon_dic)
    return mds(rscu)
def setup_parser():
    """Parse the command line and return the resulting ``args`` namespace.

    When a configuration file is given with ``-c/--configfile`` and its name
    contains "jsn", it is read as JSON and its keys
    "experimental_data_filenames", "fitness_functions_filenames" and
    "fitness_matrix_names" replace the corresponding command-line values.

    Returns
    -------
    The ``argparse.Namespace`` produced by ``parse_args()``.

    Raises
    ------
    ValueError : when a configuration file is given whose name does not
        contain "jsn" (the notice is printed as well).
    KeyError : when the configuration file lacks one of the three keys.
    """
    usage = """%(prog)s <functional argument> <output target argument>"""
    description = """Codon Bias Simulation"""
    parser = argparse.ArgumentParser(usage=usage, description=description)
    parser.add_argument(
        '-f', '--gene', action='store', nargs=1,
        dest='experimental_data_filenames',
        help='File which contains Codon sequence of Genes which are to be used for calculating the amino acid frequencies')
    parser.add_argument(
        '-v', '--verbose', action='store_true', dest='verbose', default=False)
    parser.add_argument(
        '-s', '--fitness_matrix', action='store', nargs=1,
        dest='fitness_matrix_names',
        help='Fitness matrix, e.g. amino-acid similarity')
    parser.add_argument(
        '-i', '--fitness_functions', action='append',
        dest='fitness_functions_filenames',
        help='Fitness functions')
    parser.add_argument(
        '-c', '--configfile', action='store', dest='config_filename',
        help='Configuration file')
    # The former `sys.argv.append(config)` referenced an undefined name and
    # raised NameError on every call; it has been removed.
    args = parser.parse_args()

    if args.config_filename is None:
        return args

    if "jsn" not in args.config_filename:
        message = 'other config file formats not supported, use *.jsn ending please'
        print(message)
        raise ValueError(message)

    # json.load() returns a plain dict, which is not a context manager; the
    # file handle is what has to be managed.
    with open(args.config_filename) as handle:
        config = json.load(handle)
    args.experimental_data_filenames = config["experimental_data_filenames"]
    args.fitness_functions_filenames = config["fitness_functions_filenames"]
    args.fitness_matrix_names = config["fitness_matrix_names"]
    return args
if __name__ == "__main__":
    # Running the module as a script executes the doctests embedded in its
    # docstrings.
    doctest.testmod()