"""
Brief
-----
A library for analysing codon usage bias with the quasispecies model.
Summary
-------
Routines
--------
CodonUsage
Fitnessfunction
Model
__builtins__
__doc__
__file__
__name__
__package__
c_codon_mut_dist
calculate_CAI_dic
calculate_NC
calculate_RF
calculate_RF_dic
calculate_RSCU
calculate_RSCU_dic
change_amino_acid_code
codon_hist_index
codon_index
codon_mut_dist
codon_table
codons
compute_distance
compute_optimum
compute_optimum_two_step
compute_steady_state
config_from_file
decomposition
euclidean_distance
hessian_em
highly_expressed_genes_by_file
highly_expressed_genes_by_id
init_fitnessfunctions
init_fitnessmatrix_for_amino_acids
init_fitnessmatrix_for_codons
init_list_of_genes
is_ambig_codon
kl_distance
load_fasta
load_fasta_from_url
load_genbank
load_plaintext
make_codon_histogram
make_codon_histogram_dic
make_codon_histogram_dic_combined
make_evolmatrix
make_jc69_mutationmatrix
make_mutationmatrix
mds_from_fasta
minimize
number_of_codons
optimize
optimize_dic
optimize_sequence
parametric_run_from_config
plot_all_reduction_methods
plot_mds
plot_pca
py_codon_mut_dist
relative_cosine_distance
relative_euclidean_distance
relative_hellinger_distance
relative_jeffrey_divergence
relative_minkowski_distance
remove_stopcodons_and_one_codon_amino_acids
remove_stopcodons_and_one_codon_amino_acids_dic
run_model
sample_run
setup_parser
transition
Examples
--------
"""
# NOTE(review): 21 equals the length of the amino-acid histograms built in
# this module (20 amino acids + '*'), not the number of codons (64) --
# confirm what this constant is meant to express.
number_of_codons = 21
import doctest
from Bio import SeqIO
import sys
import argparse
import json
import time
import itertools
import numpy as np
import scipy as sp
import scipy.linalg
import scipy.sparse
import scipy.sparse.linalg
import scipy.spatial
from Bio import SeqIO
import pylab as p
import pprint
from sklearn import manifold
from sklearn import decomposition
import scipy
from scipy import weave
import numpy as np
import collections
import urllib2
# make a codon table
# Bases are enumerated in t, c, a, g order so the 64 codons come out as
# 'ttt', 'ttc', 'tta', 'ttg', 'tct', ..., 'ggg'; the amino-acid strings below
# are written in exactly this order.
bases = ['t', 'c', 'a', 'g']
codons = [a + b + c for a in bases for b in bases for c in bases]
#from ncbi genetic codes
# Each value is a 64-character string: character i is the one-letter symbol
# that gets paired with codons[i] when the lookup tables are zipped together
# further down ('*' presumably marks a stop codon).
genetic_codes = collections.OrderedDict({})
genetic_codes['The Standard Code'] = 'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG'
genetic_codes['The Vertebrate Mitochondrial Code'] = 'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG'
genetic_codes['The Yeast Mitochondrial Code'] = 'FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG'
genetic_codes['The Mold, Protozoan, and Coelenterate Mitochondrial Code'] = 'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG'
genetic_codes['The Invertebrate Mitochondrial Code'] = 'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSSSVVVVAAAADDEEGGGG'
genetic_codes['The Ciliate, Dasycladacean and Hexamita Nuclear Code'] = 'FFLLSSSSYYQQCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG'
genetic_codes['The Echinoderm and Flatworm Mitochondrial Code'] = 'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG'
genetic_codes['The Euplotid Nuclear Code'] = 'FFLLSSSSYY**CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG'
genetic_codes['The Bacterial, Archaeal and Plant Plastid Code'] = 'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG'
genetic_codes['The Alternative Yeast Nuclear Code'] = 'FFLLSSSSYY**CC*WLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG'
genetic_codes['The Ascidian Mitochondrial Code'] = 'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG'
genetic_codes['The Alternative Flatworm Mitochondrial Code'] = 'FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG'
genetic_codes['The Blepharisma Nuclear Code'] = 'FFLLSSSSYY*QCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG'
genetic_codes['Chlorophycean Mitochondrial Code'] = 'FFLLSSSSYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG'
genetic_codes['Trematode Mitochondrial Code'] = 'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG'
genetic_codes['Scenedesmus obliquus mitochondrial Code'] = 'FFLLSS*SYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG'
genetic_codes['Thraustochytrium Mitochondrial Code'] = 'FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG'
genetic_codes['Pterobranchia mitochondrial code'] = 'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSSKVVVVAAAADDEEGGGG'
# Literal values of the two derived strings for the standard code, kept for
# reference only; the live values are computed from genetic_codes below.
#amino_acids = 'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG'
#amino_acids_unique = 'FLSY*CWPHQRIMTNKVADEG'
# Active genetic code: one amino-acid letter per codon, index-aligned with
# ``codons``.  The standard code is the default; change_amino_acid_code()
# rebinds the tables derived from it.
amino_acids = genetic_codes['The Standard Code']
# Distinct amino-acid symbols in order of first appearance.
amino_acids_unique = ''.join(collections.OrderedDict.fromkeys(amino_acids))
# Lower-case characters that make a codon unusable for counting; consumed by
# is_ambig_codon().
ambig_fasta_chars = list('RYKMSWBDHVNX-'.lower())
# codon -> amino-acid letter
codon_table = dict(zip(codons, amino_acids))
# amino-acid letter -> position in the amino-acid histograms
amino_table_index = dict((aa, pos) for pos, aa in enumerate(amino_acids_unique))
# make a codon index hash
index = range(0, 64)
codon_hist_index = dict(zip(codons, index))
aa_hist_index = {}
# reverse codon table: amino-acid letter -> list of its codons.  Built by
# walking ``codons`` (not the dict) so the codon order inside each list is
# deterministic.
rev_codon_table = {}
for k, v in zip(codons, amino_acids):
    rev_codon_table.setdefault(v, []).append(k)
codon_index = dict(zip(codons, range(len(codons))))
def is_ambig_codon(codon):
    """Tell whether a codon contains a character from ``ambig_fasta_chars``.

    Parameters
    ----------
    codon : str
        Codon to inspect.  The comparison is case-sensitive and the reference
        list holds lower-case characters, so callers lower-case the codon
        first.

    Returns
    -------
    bool
        True if at least one character of `codon` is in the module-level
        ``ambig_fasta_chars`` list, False otherwise (also for an empty string).
    """
    return any(letter in ambig_fasta_chars for letter in codon)
def change_amino_acid_code(code_name):
    """Rebuild the module-level codon/amino-acid tables for another genetic code.

    Parameters
    ----------
    code_name : str
        Key into ``genetic_codes``.

    Returns
    -------
    int
        1 when the tables were rebuilt; 0 when `code_name` is not a key of
        ``genetic_codes`` (a message is printed and nothing is changed).
    """
    global amino_acids
    global amino_acids_unique
    global codon_table
    global amino_table_index
    global codon_hist_index
    global aa_hist_index
    global rev_codon_table
    global codon_index
    if code_name not in genetic_codes:
        print('genetic code not valid!')
        return 0
    amino_acids = genetic_codes[code_name]
    # Distinct symbols in order of first appearance.
    amino_acids_unique = ''.join(collections.OrderedDict.fromkeys(amino_acids))
    codon_table = dict(zip(codons, amino_acids))
    amino_table_index = dict((aa, pos) for pos, aa in enumerate(amino_acids_unique))
    # make a codon index hash
    codon_hist_index = dict(zip(codons, range(0, 64)))
    aa_hist_index = {}
    # reverse codon table, filled in ``codons`` order so that the codon order
    # inside each list is deterministic
    rev_codon_table = {}
    for codon, aa in zip(codons, amino_acids):
        rev_codon_table.setdefault(aa, []).append(codon)
    codon_index = dict(zip(codons, range(len(codons))))
    return 1
def load_fasta_from_url(url):
    """Open `url` and return a Biopython FASTA parser reading from it.

    Parameters
    ----------
    url : str
        Address of the FASTA resource.

    Returns
    -------
    fasta : generator or None
        The object returned by ``SeqIO.parse``; None when the URL could not be
        opened or the parser could not be created (the error is printed).
    """
    req = urllib2.Request(url)
    try:
        f = urllib2.urlopen(req)
    except urllib2.URLError as e:
        if hasattr(e, 'reason'):
            print('We failed to reach a server.')
            print('Reason:  %s' % (e.reason,))
        elif hasattr(e, 'code'):
            print('The server couldn\'t fulfill the request.')
            print('Error code:  %s' % (e.code,))
        return None
    try:
        return SeqIO.parse(f, 'fasta')
    except Exception as error:
        # The handler used to reference an undefined name ``filename``;
        # report the URL that was requested instead.
        f.close()
        print('import: %s  failed' % (url,))
        print(error)
        return None
def load_fasta(filename):
    """Loads a fasta file with filename as arg. Returns generator object with list of genes

    Parameters
    ----------
    filename : str
        Path to the file to load

    Returns
    -------
    fasta : generator or None
        Parser from Biopython (``SeqIO.parse``).  None is returned, after
        printing the error, when the file cannot be opened or the parser
        cannot be created.

    Examples
    --------
    If everything is right, a generator object is returned

    >>> load_fasta("testdata.ffn") # doctest: +ELLIPSIS
    <generator object parse at ...>

    The file must exist

    >>> load_fasta("yikes")
    Open: yikes  failed
    [Errno 2] No such file or directory: 'yikes'

    However, we use Biopython and the fasta file's syntax is not checked here!

    >>> load_fasta("testdata_fail.ffn") # doctest: +ELLIPSIS
    <generator object parse at ...>
    """
    try:
        f = open(filename)
    except Exception as error:
        print('Open: %s  failed' % (filename,))
        print(error)
        return None
    try:
        # The parser reads lazily, so the handle has to stay open for the
        # caller; it is only closed here when no parser could be created.
        return SeqIO.parse(f, 'fasta')
    except Exception as error:
        f.close()
        print('import: %s  failed' % (filename,))
        print(error)
        return None
def load_genbank(filename):
    """Loads a genbank file with filename as arg. Returns generator object with list of genes

    Parameters
    ----------
    filename : str
        Path to the file to load

    Returns
    -------
    genes : generator or None
        Parser from Biopython (``SeqIO.parse``).  None is returned, after
        printing the error, when the file cannot be opened or the parser
        cannot be created.
    """
    try:
        f = open(filename)
    except Exception as error:
        print('Open: %s  failed' % (filename,))
        print(error)
        return None
    try:
        # The parser reads lazily, so the handle has to stay open for the
        # caller; it is only closed here when no parser could be created.
        return SeqIO.parse(f, 'genbank')
    except Exception as error:
        f.close()
        print('import: %s  failed' % (filename,))
        print(error)
        return None
def load_plaintext(filename):
    """Placeholder for loading a plain-text nucleotide file.

    Nothing is read: the function only prints a notice and returns None.
    """
    print('not implemented yet')
def make_codon_histogram(gene):
    """returns codon_hist,aa_hist for one gene

    Parameters
    ----------
    gene : SeqRecord
        One record as yielded by load_fasta; only ``gene.seq`` is read.

    Returns
    -------
    codon_hist : list of 64 floats
        Relative frequency of every codon, ordered like ``codons``.
    aa_hist : list of 21 floats
        Relative frequency of every amino-acid symbol, ordered like
        ``amino_acids_unique``.

    Notes
    -----
    The sequence is cut into consecutive triplets.  Triplets containing a
    character from ``ambig_fasta_chars`` and triplets that are not a known
    codon (e.g. an incomplete trailing one) are skipped.  If nothing could be
    counted the frequencies are 0/0, i.e. NaN.

    Examples
    --------
    The histogram should reproduce what www.kazusa.org.jp/codon/cgi-bin/countcodon.cgi computes
    for the gene in testdata.ffn::

        UUU 12.6( 9) UCU 9.8( 7) UAU 15.4( 11) UGU 4.2( 3)
        UUC 21.1( 15) UCC 16.9( 12) UAC 11.2( 8) UGC 12.6( 9)
        UUA 15.4( 11) UCA 8.4( 6) UAA 0.0( 0) UGA 1.4( 1)
        UUG 12.6( 9) UCG 12.6( 9) UAG 0.0( 0) UGG 5.6( 4)
        CUU 8.4( 6) CCU 4.2( 3) CAU 9.8( 7) CGU 22.5( 16)
        CUC 15.4( 11) CCC 5.6( 4) CAC 7.0( 5) CGC 25.3( 18)
        CUA 4.2( 3) CCA 1.4( 1) CAA 11.2( 8) CGA 2.8( 2)
        CUG 53.4( 38) CCG 23.9( 17) CAG 19.7( 14) CGG 5.6( 4)
        AUU 36.5( 26) ACU 5.6( 4) AAU 28.1( 20) AGU 7.0( 5)
        AUC 16.9( 12) ACC 23.9( 17) AAC 19.7( 14) AGC 8.4( 6)
        AUA 0.0( 0) ACA 0.0( 0) AAA 26.7( 19) AGA 0.0( 0)
        AUG 29.5( 21) ACG 9.8( 7) AAG 14.0( 10) AGG 1.4( 1)
        GUU 25.3( 18) GCU 25.3( 18) GAU 37.9( 27) GGU 28.1( 20)
        GUC 19.7( 14) GCC 32.3( 23) GAC 18.3( 13) GGC 28.1( 20)
        GUA 8.4( 6) GCA 12.6( 9) GAA 47.8( 34) GGA 12.6( 9)
        GUG 32.3( 23) GCG 35.1( 25) GAG 18.3( 13) GGG 9.8( 7)

    in the first field. Remember, the order of codons in this package is given in the variable codons

    >>> codon_hist, aa_hist = make_codon_histogram(next(load_fasta("testdata.ffn")))
    >>> len(codon_hist), len(aa_hist)
    (64, 21)
    >>> bool(abs(codon_hist[codon_hist_index['ttt']] * 712 - 9) < 1e-9)
    True
    >>> bool(abs(codon_hist[codon_hist_index['ttc']] * 712 - 15) < 1e-9)
    True
    """
    codon_counts = np.zeros(64, dtype=int)
    aa_counts = np.zeros(21, dtype=int)
    # str() works for every Biopython release; Seq.tostring() no longer exists.
    sequence = str(gene.seq).lower()
    for start in range(0, len(sequence), 3):
        codon = sequence[start:start + 3]
        if is_ambig_codon(codon):
            continue
        i = codon_hist_index.get(codon)
        if i is None:
            # not a complete, known codon
            continue
        codon_counts[i] += 1
        aa_counts[amino_table_index[codon_table[codon]]] += 1
    # Normalise once per histogram instead of re-summing for every element.
    codon_hist = list(codon_counts / float(codon_counts.sum()))
    aa_hist = list(aa_counts / float(aa_counts.sum()))
    return (codon_hist, aa_hist)
def make_codon_histogram_dic_combined(f):
    """Pool all genes of a parsed file into one codon and one amino-acid histogram.

    Parameters
    ----------
    f : iterable of SeqRecord
        Records as yielded by load_fasta / load_genbank; only ``gene.seq`` is
        read.

    Returns
    -------
    codon_hist : dict
        ``{'combined genome': list of 64 relative codon frequencies}``,
        ordered like ``codons``.
    aa_hist : dict
        ``{'combined genome': list of 21 relative amino-acid frequencies}``,
        ordered like ``amino_acids_unique``.

    Notes
    -----
    Triplets containing a character from ``ambig_fasta_chars`` and triplets
    that are not a known codon are skipped.  If nothing could be counted the
    frequencies are 0/0, i.e. NaN.
    """
    label = 'combined genome'
    codon_counts = np.zeros(64, dtype=int)
    aa_counts = np.zeros(21, dtype=int)
    for gene in f:
        # str() works for every Biopython release; Seq.tostring() no longer exists.
        s = str(gene.seq).lower()
        for start in range(0, len(s), 3):
            codon = s[start:start + 3]
            if is_ambig_codon(codon):
                continue
            i = codon_hist_index.get(codon)
            if i is None:
                # not a complete, known codon
                continue
            codon_counts[i] += 1
            aa_counts[amino_table_index[codon_table[codon]]] += 1
    # Normalise once per histogram instead of re-summing for every element.
    codon_hist = {label: list(codon_counts / float(codon_counts.sum()))}
    aa_hist = {label: list(aa_counts / float(aa_counts.sum()))}
    return (codon_hist, aa_hist)
def make_codon_histogram_dic(f):
    """returns codon_hist, aa_hist for all genes in a fasta file as dictionaries with fasta identifiers as keys

    Parameters
    ----------
    f : iterable of SeqRecord
        Records as yielded by load_fasta; ``gene.id`` and ``gene.seq`` are read.

    Returns
    -------
    codon_hist : dict
        ``gene.id`` -> list of 64 relative codon frequencies, ordered like
        ``codons``.
    aa_hist : dict
        ``gene.id`` -> list of 21 relative amino-acid frequencies, ordered
        like ``amino_acids_unique``.

    Notes
    -----
    Triplets containing a character from ``ambig_fasta_chars`` and triplets
    that are not a known codon are skipped.  A gene in which nothing could be
    counted gets 0/0 = NaN frequencies.

    Examples
    --------
    The histogram should reproduce what www.kazusa.org.jp/codon/cgi-bin/countcodon.cgi
    computes for the first gene in testdata2.ffn::

        UUU 12.6( 9) UCU 9.8( 7) UAU 15.4( 11) UGU 4.2( 3)
        UUC 21.1( 15) UCC 16.9( 12) UAC 11.2( 8) UGC 12.6( 9)
        UUA 15.4( 11) UCA 8.4( 6) UAA 0.0( 0) UGA 1.4( 1)
        UUG 12.6( 9) UCG 12.6( 9) UAG 0.0( 0) UGG 5.6( 4)
        CUU 8.4( 6) CCU 4.2( 3) CAU 9.8( 7) CGU 22.5( 16)
        CUC 15.4( 11) CCC 5.6( 4) CAC 7.0( 5) CGC 25.3( 18)
        CUA 4.2( 3) CCA 1.4( 1) CAA 11.2( 8) CGA 2.8( 2)
        CUG 53.4( 38) CCG 23.9( 17) CAG 19.7( 14) CGG 5.6( 4)
        AUU 36.5( 26) ACU 5.6( 4) AAU 28.1( 20) AGU 7.0( 5)
        AUC 16.9( 12) ACC 23.9( 17) AAC 19.7( 14) AGC 8.4( 6)
        AUA 0.0( 0) ACA 0.0( 0) AAA 26.7( 19) AGA 0.0( 0)
        AUG 29.5( 21) ACG 9.8( 7) AAG 14.0( 10) AGG 1.4( 1)
        GUU 25.3( 18) GCU 25.3( 18) GAU 37.9( 27) GGU 28.1( 20)
        GUC 19.7( 14) GCC 32.3( 23) GAC 18.3( 13) GGC 28.1( 20)
        GUA 8.4( 6) GCA 12.6( 9) GAA 47.8( 34) GGA 12.6( 9)
        GUG 32.3( 23) GCG 35.1( 25) GAG 18.3( 13) GGG 9.8( 7)

    in the first field. Remember, the order of codons in this package is given in the variable codons

    >>> codon_hist, aa_hist = make_codon_histogram_dic( load_fasta("testdata2.ffn") )
    >>> sorted(codon_hist) == sorted(['fid|18348942|locus|VBIEscCol44059_0001|', 'fid|129049020348348942|locus|VBIEscCol44059_0001|'])
    True
    >>> bool(abs(codon_hist['fid|18348942|locus|VBIEscCol44059_0001|'][0] * 712 - 9) < 1e-9)
    True
    """
    codon_hist = {}
    aa_hist = {}
    for gene in f:
        # one pair of histograms per gene in the file
        codon_counts = np.zeros(64, dtype=int)
        aa_counts = np.zeros(21, dtype=int)
        # str() works for every Biopython release; Seq.tostring() no longer exists.
        s = str(gene.seq).lower()
        for start in range(0, len(s), 3):
            codon = s[start:start + 3]
            if is_ambig_codon(codon):
                continue
            i = codon_hist_index.get(codon)
            if i is None:
                # not a complete, known codon
                continue
            codon_counts[i] += 1
            aa_counts[amino_table_index[codon_table[codon]]] += 1
        # Normalise once per histogram instead of re-summing for every element.
        codon_hist[gene.id] = list(codon_counts / float(codon_counts.sum()))
        aa_hist[gene.id] = list(aa_counts / float(aa_counts.sum()))
    return (codon_hist, aa_hist)
def calculate_RF_dic(codon_hist):
    """Calculates relative codon frequency for each gene in codon histogram

    For every amino acid the frequencies of its codons are divided by their
    sum, so the codons of one amino acid add up to 1.  Codons of an amino acid
    that does not occur in a gene stay 0.

    Parameters
    ----------
    codon_hist : dict
        gene id -> sequence of 64 codon frequencies (or counts), ordered like
        ``codons``.

    Returns
    -------
    hist_rf : dict
        gene id -> np.array of 64 relative frequencies.

    Examples
    --------
    Test against run of testdata2.ffn and handcalculated for amino acid A
    (codons 52 - 55) should give 0.24,0.3066,0.12,0.33

    >>> rf = calculate_RF_dic( (make_codon_histogram_dic( load_fasta("testdata2.ffn")))[0] )
    >>> bool(np.allclose(rf['fid|18348942|locus|VBIEscCol44059_0001|'][52:56], [0.24, 0.30666667, 0.12, 0.33333333]))
    True
    >>> bool(np.allclose(rf['fid|18348942|locus|VBIEscCol44059_0001|'][:2], [0.375, 0.625]))
    True
    """
    hist_rf = {}
    for gene in codon_hist:
        gene_hist = codon_hist[gene]
        rf = np.zeros(64)
        for aminoacid in rev_codon_table:
            slots = [codon_hist_index[codon] for codon in rev_codon_table[aminoacid]]
            # float start value keeps the division below a true division even
            # for integer counts
            new_norm = sum((gene_hist[slot] for slot in slots), 0.)
            if new_norm > 0:
                for slot in slots:
                    rf[slot] = gene_hist[slot] / new_norm
        hist_rf[gene] = rf
    return hist_rf
def calculate_RF(codon_hist):
    """Calculates relative codon frequency for a single codon histogram.

    Thin wrapper: the histogram is stored under a throw-away key, passed
    through calculate_RF_dic, and the entry for that key is returned.
    """
    wrapped = {'test': codon_hist}
    return calculate_RF_dic(wrapped)['test']
def calculate_RSCU_dic(codon_hist):
    """returns codon_rscu for each gene in codon histogram

    For every amino acid, each codon frequency is divided by the summed
    frequency of all codons of that amino acid and multiplied by the number of
    those codons.  Codons of an amino acid that does not occur in a gene
    stay 0.

    Parameters
    ----------
    codon_hist : dict
        gene id -> sequence of 64 codon frequencies (or counts), ordered like
        ``codons``.

    Returns
    -------
    hist_rscu : dict
        gene id -> np.array of 64 RSCU values.

    Examples
    --------
    test like calculate_rf and checked against genomes.urv.es/optimizer

    >>> rscu = calculate_RSCU_dic( (make_codon_histogram_dic( load_fasta("testdata2.ffn")))[0] )
    >>> bool(np.allclose(rscu['fid|18348942|locus|VBIEscCol44059_0001|'][:2], [0.75, 1.25]))
    True
    >>> bool(np.allclose(rscu['fid|18348942|locus|VBIEscCol44059_0001|'][52:56], [0.96, 1.22666667, 0.48, 1.33333333]))
    True
    """
    hist_rscu = {}
    for gene in codon_hist:
        gene_hist = codon_hist[gene]
        rscu = np.zeros(64)
        for aminoacid in rev_codon_table:
            slots = [codon_hist_index[codon] for codon in rev_codon_table[aminoacid]]
            # float start value keeps the division below a true division even
            # for integer counts
            new_norm = sum((gene_hist[slot] for slot in slots), 0.)
            if new_norm > 0:
                for slot in slots:
                    rscu[slot] = len(slots) * gene_hist[slot] / new_norm
        hist_rscu[gene] = rscu
    return hist_rscu
def calculate_RSCU(codon_hist):
    """returns codon_rscu for a gene

    Parameters
    ----------
    codon_hist : sequence of 64 numbers
        Codon frequencies (or counts) of one gene, ordered like ``codons``.

    Returns
    -------
    hist_rscu : np.array
        64 values; for each amino acid, codon frequency divided by the summed
        frequency of its codons, times the number of those codons.  Codons of
        an amino acid that does not occur stay 0.
    """
    hist_rscu = np.zeros(64)
    for aminoacid in rev_codon_table:
        slots = [codon_hist_index[codon] for codon in rev_codon_table[aminoacid]]
        # float start value keeps the division below a true division even for
        # integer counts
        new_norm = sum((codon_hist[slot] for slot in slots), 0.)
        if new_norm > 0:
            for slot in slots:
                hist_rscu[slot] = len(slots) * codon_hist[slot] / new_norm
    return hist_rscu
from Bio.SeqUtils import CodonUsage
import Bio.SeqUtils.CodonUsageIndices
def calculate_CAI_dic(fasta_filename,heg_fasta=None,reference=Bio.SeqUtils.CodonUsageIndices.SharpEcoliIndex):
    """Compute the codon adaptation index of every gene in a fasta file.

    Parameters
    ----------
    fasta_filename : str
        Fasta file with the genes to score; read through load_fasta.
    heg_fasta : optional
        Passed to ``CodonAdaptationIndex.generate_index`` to build the index.
        When it is None, or when building the index fails, `reference` is
        used instead.
    reference : dict
        Index handed to ``CodonAdaptationIndex.set_cai_index``.

    Returns
    -------
    cais : dict
        gene id -> CAI value.  A gene whose CAI cannot be computed gets 0. and
        the error is printed; processing continues with the next gene.
    """
    fasta = load_fasta(fasta_filename)
    CAI = CodonUsage.CodonAdaptationIndex()
    if heg_fasta is not None:
        try:
            CAI.generate_index(heg_fasta)
        except Exception:
            CAI.set_cai_index(reference)
    else:
        CAI.set_cai_index(reference)
    cais = {}
    for gene in fasta:
        try:
            # str() works for every Biopython release; Seq.tostring() no longer exists.
            sequence = str(gene.seq)
            triplets = [ sequence[start:start + 3] for start in range( 0, len( sequence ), 3 ) ]
            # drop codons with ambiguous characters before scoring
            usable = ''.join( codon for codon in triplets if not is_ambig_codon( codon.lower() ) )
            cais[gene.id] = CAI.cai_for_gene(usable)
        except Exception as e:
            # A failing gene no longer aborts the run with a (None, None)
            # tuple; it is recorded as 0. like any other scoring failure.
            cais[gene.id] = 0.
            print(e)
    return cais
def calculate_NC(codon_hist):
    """Placeholder: nothing is computed and None is returned."""
    return None
def highly_expressed_genes_by_file(filename,hist):
    """Loads a list of highly expressed genes (HEG) and flags the genes of `hist` found in it.

    The format used is that from ecai/heg: whitespace-separated columns, the
    sixth column of the first line contains 'Synonym', the first three lines
    are skipped and the sixth column of each remaining line is a gene synonym.
    A gene of `hist` counts as highly expressed when a synonym is a substring
    of its key.

    Parameters
    ----------
    filename : str
        Path of the HEG file.
    hist : dict
        Histogram dictionary keyed by gene id.  Use an ordered dictionary if
        the positions in the returned flag list matter.

    Returns
    -------
    is_heg : list of int
        One entry per key of `hist`, 1 if the gene matched a synonym, else 0.
    is_heg_id : list of str
        Matching keys; a key is listed once per synonym it matches.

    On any error (missing file, wrong header, ...) the error is printed and
    whatever was collected so far is returned.

    Example
    -------
    Load a fasta file, compute codon histogram and then read in a list of highly expressed genes,
    then show some of the gene.ids that could be found

    >>> f = load_fasta('Escherichia_coli_O157-H7_EDL933Refseq.ffn')
    >>> chist,ahist = make_codon_histogram_dic(f)
    >>> truth,ids = highly_expressed_genes_by_file('ecoli3.heg.txt',chist)
    >>> ids[1:10]
    ['fid|127177|locus|Z4698|', 'fid|130454|locus|Z0298|', 'fid|129649|locus|Z2916|', 'fid|129950|locus|Z4588|', 'fid|128046|locus|Z5060|', 'fid|128982|locus|Z4737|', 'fid|129044|locus|Z4697|', 'fid|129074|locus|Z3827|', 'fid|130382|locus|Z4114|']
    """
    #you need an ordered dictionary if you want to use is_heg
    hist = collections.OrderedDict(hist)
    is_heg = [0] * len( hist )
    is_heg_id = []
    try:
        with open( filename ) as f:
            data = [ line.split() for line in f ]
        if 'Synonym' not in data[0][5]:
            raise Exception("please use file from HEG database")
        # Only rows that really have a sixth column; a shorter row used to
        # raise IndexError and throw away the whole file.
        heg = [row[5] for row in data[3:] if len( row ) > 5]
        for i, gene in enumerate( hist ):
            for test_heg in heg:
                if test_heg in gene:
                    is_heg_id.append(gene)
                    is_heg[i] = 1
    except Exception as e:
        print(e)
    return is_heg,is_heg_id
def highly_expressed_genes_by_id(id_list,hist):
    """Flag the genes of `hist` whose key contains one of the given ids.

    Marked "not implemented right yet" by the original author.

    Parameters
    ----------
    id_list : iterable of str
        Ids to look for; an id matches when it is a substring of a key.
    hist : dict or sequence
        Gene ids (e.g. the keys of a histogram dictionary).

    Returns
    -------
    list of bool
        One entry per element of `hist`, in iteration order.
    """
    return [any(test_heg in gene for test_heg in id_list) for gene in hist]
def remove_stopcodons_and_one_codon_amino_acids(codon_hist):
    """Drop five fixed codon positions from a 64-entry codon histogram.

    The removed positions are 10, 11, 14, 15 and 35, i.e. 'taa', 'tag', 'tga',
    'tgg' and 'atg' in the order of ``codons``.  The positions are hard-coded
    and do not follow the currently selected genetic code.

    Parameters
    ----------
    codon_hist : sequence of 64 numbers

    Returns
    -------
    np.array
        The remaining 59 entries in their original order.
    """
    dropped = (10, 11, 14, 15, 35)
    kept = [position for position in range(64) if position not in dropped]
    return np.array(codon_hist)[kept]
def remove_stopcodons_and_one_codon_amino_acids_dic(codon_hist):
    """Drop five fixed codon positions from every histogram of a dictionary.

    The removed positions are 10, 11, 14, 15 and 35, i.e. 'taa', 'tag', 'tga',
    'tgg' and 'atg' in the order of ``codons``.  The positions are hard-coded
    and do not follow the currently selected genetic code.

    Parameters
    ----------
    codon_hist : dict
        gene id -> sequence of 64 numbers.

    Returns
    -------
    dict
        gene id -> np.array with the remaining 59 entries in original order.
    """
    dropped = (10, 11, 14, 15, 35)
    kept = [position for position in range(64) if position not in dropped]
    return dict((gene, np.array(codon_hist[gene])[kept]) for gene in codon_hist)
def mds(hist,n_components=2,max_iter=300,n_init=8,n_jobs=-1):
    """Embed the histograms of `hist` with multidimensional scaling.

    Parameters
    ----------
    hist : dict
        gene id -> feature vector (e.g. codon histogram or RSCU values).
    n_components, max_iter, n_init, n_jobs
        Forwarded to ``sklearn.manifold.MDS``.  They used to be accepted but
        ignored in favour of the hard-coded values 300 / 8 / -1, which are
        still the defaults.

    Returns
    -------
    Y : array
        Result of ``MDS.fit_transform`` on the stacked feature vectors, one
        row per value of `hist` in dictionary iteration order.
    """
    X = np.array( list( hist.values() ) )
    embedding = manifold.MDS( n_components = n_components, max_iter = max_iter, n_init = n_init, n_jobs = n_jobs )
    return embedding.fit_transform( X )
def plot_mds(hist,n_components=2,max_iter=300,n_init=8,n_jobs=-1):
    """Embed the histograms of `hist` with multidimensional scaling and plot the first two axes.

    Parameters
    ----------
    hist : dict
        gene id -> feature vector (e.g. codon histogram or RSCU values).
    n_components, max_iter, n_init, n_jobs
        Forwarded to ``sklearn.manifold.MDS``.  They used to be accepted but
        ignored in favour of the hard-coded values 300 / 8 / -1, which are
        still the defaults.

    Returns
    -------
    Y : array
        Result of ``MDS.fit_transform``; columns 0 and 1 are drawn as dots
        with pylab and the figure is shown.
    """
    X = np.array( list( hist.values() ) )
    embedding = manifold.MDS( n_components = n_components, max_iter = max_iter, n_init = n_init, n_jobs = n_jobs )
    Y = embedding.fit_transform( X )
    p.plot( Y[:,0], Y[:,1],'.' )
    p.show()
    return Y
def isomap(hist,n_neighbors=10,n_components=2):
    """Embed the histograms of `hist` with Isomap.

    Parameters
    ----------
    hist : dict
        gene id -> feature vector.
    n_neighbors, n_components : int
        Forwarded by keyword to ``sklearn.manifold.Isomap``.

    Returns
    -------
    Y : array
        Result of ``fit_transform`` on the stacked feature vectors, one row
        per value of `hist` in dictionary iteration order.
    """
    X = np.array( list( hist.values() ) )
    # keyword arguments: current scikit-learn rejects positional options
    Y = manifold.Isomap(n_neighbors=n_neighbors, n_components=n_components).fit_transform(X)
    return Y
def lle(hist,n_neighbors=10,n_components=2):
    """Embed the histograms of `hist` with standard locally linear embedding.

    Parameters
    ----------
    hist : dict
        gene id -> feature vector.
    n_neighbors, n_components : int
        Forwarded by keyword to ``sklearn.manifold.LocallyLinearEmbedding``.

    Returns
    -------
    Y : array
        Result of ``fit_transform`` on the stacked feature vectors, one row
        per value of `hist` in dictionary iteration order.
    """
    X = np.array( list( hist.values() ) )
    # keyword arguments: current scikit-learn rejects positional options
    Y = manifold.LocallyLinearEmbedding(n_neighbors=n_neighbors,
                                        n_components=n_components,
                                        eigen_solver='auto',
                                        method='standard').fit_transform(X)
    return Y
def hessian_em(hist,n_neighbors=10,n_components=2):
    """Embed the histograms of `hist` with Hessian locally linear embedding.

    Parameters
    ----------
    hist : dict
        gene id -> feature vector.
    n_neighbors, n_components : int
        Forwarded by keyword to ``sklearn.manifold.LocallyLinearEmbedding``
        with ``method='hessian'``.

    Returns
    -------
    Y : array
        Result of ``fit_transform`` on the stacked feature vectors, one row
        per value of `hist` in dictionary iteration order.
    """
    X = np.array( list( hist.values() ) )
    # keyword arguments: current scikit-learn rejects positional options
    Y = manifold.LocallyLinearEmbedding(n_neighbors=n_neighbors,
                                        n_components=n_components,
                                        eigen_solver='auto',
                                        method='hessian').fit_transform(X)
    return Y
def spectral(hist,n_neighbors=10,n_components=2):
    """Embed the histograms of `hist` with a spectral embedding.

    Parameters
    ----------
    hist : dict
        gene id -> feature vector.
    n_neighbors, n_components : int
        Forwarded to ``sklearn.manifold.SpectralEmbedding``.

    Returns
    -------
    Y : array
        Result of ``fit_transform`` on the stacked feature vectors.
    """
    features = np.array( list( hist.values() ) )
    embedder = manifold.SpectralEmbedding(n_components=n_components,
                                          n_neighbors=n_neighbors)
    return embedder.fit_transform(features)
def pca(hist,n_components=2):
    """Project the histograms of `hist` onto their first principal components.

    A full PCA is fitted first so that the explained variance of all
    components can be printed; the model is then refitted with
    `n_components` components.

    Parameters
    ----------
    hist : dict
        gene id -> feature vector.
    n_components : int
        Number of components kept in the returned projection.

    Returns
    -------
    X_reduced : array
        Result of ``PCA.fit_transform`` on the stacked feature vectors.
    """
    features = np.array( list( hist.values() ) )
    model = decomposition.PCA()
    model.fit(features)
    print('explained variance')
    print(model.explained_variance_)
    model.n_components = n_components
    return model.fit_transform(features)
def plot_pca(hist,n_components=2,is_heg=None):
    """Project the histograms of `hist` with PCA and plot the first two components.

    A full PCA is fitted first so that the explained variance of all
    components can be printed; the model is then refitted with
    `n_components` components.

    Parameters
    ----------
    hist : dict
        gene id -> feature vector.
    n_components : int
        Number of components kept in the returned projection.
    is_heg : sequence, optional
        One flag per gene, in the iteration order of `hist`.  Genes flagged 1
        are drawn red, all others blue.  Without it all genes are drawn as
        plain dots.

    Returns
    -------
    X_reduced : array
        Result of ``PCA.fit_transform`` on the stacked feature vectors.
    """
    X = np.array( list( hist.values() ) )
    model = decomposition.PCA()
    model.fit(X)
    print('explained variance')
    print(model.explained_variance_)
    model.n_components = n_components
    X_reduced = model.fit_transform(X)
    if is_heg is None:
        p.plot( X_reduced[:,0], X_reduced[:,1],'.' )
    else:
        color = ['r' if flag == 1 else 'b' for flag in is_heg]
        fig = p.figure( figsize=(15,8) )
        # explicit position: add_subplot() without arguments is not portable
        # across matplotlib versions, and the axes were previously unused
        ax = fig.add_subplot(111)
        ax.scatter( X_reduced[:,0], X_reduced[:,1],c=color )
    p.show()
    return X_reduced
def plot_all_reduction_methods(hist,n_neighbors=10,n_components=2,is_heg=None,n_subset=None):
    """Run Isomap, LLE, Hessian LLE, MDS and PCA on `hist` and show them side by side.

    Parameters
    ----------
    hist : dict
        gene id -> feature vector.
    n_neighbors, n_components : int
        Forwarded to the neighbour-based manifold methods; `n_components` is
        also used for PCA and MDS.
    is_heg : sequence, optional
        One flag per gene, in the iteration order of `hist`.  Genes flagged 1
        are drawn red, all others blue.
    n_subset : int, optional
        Only the first `n_subset` genes are embedded and drawn.

    Notes
    -----
    Isomap, LLE and Hessian LLE failures are printed and the corresponding
    panel is left out; Hessian LLE is retried once with the dense eigen
    solver.  Nothing is returned; the figure is shown with pylab.
    """
    X = np.array( list( hist.values() ) )
    if is_heg is None:
        # one colour per gene (row of X), not per feature column
        color = ['b'] * len(X)
    else:
        color = ['r' if flag == 1 else 'b' for flag in is_heg]
    if n_subset is not None:
        X = X[0:n_subset,:]
        # keep the colours aligned with the rows that are actually plotted
        color = color[0:n_subset]
    iso_xy = None
    try:
        iso_xy = manifold.Isomap(n_neighbors=n_neighbors, n_components=n_components).fit_transform(X)
    except Exception as e:
        print(e)
    lle_xy = None
    try:
        lle_xy = manifold.LocallyLinearEmbedding(n_neighbors=n_neighbors,
                                                 n_components=n_components,
                                                 eigen_solver='auto',
                                                 method='standard').fit_transform(X)
    except Exception as e:
        print(e)
    hessian_xy = None
    try:
        hessian_xy = manifold.LocallyLinearEmbedding(n_neighbors=n_neighbors,
                                                     n_components=n_components,
                                                     eigen_solver='auto',
                                                     method='hessian').fit_transform(X)
    except Exception as e:
        print(e)
        print('trying dense')
        hessian_xy = manifold.LocallyLinearEmbedding(n_neighbors=n_neighbors,
                                                     n_components=n_components,
                                                     eigen_solver='dense',
                                                     method='hessian').fit_transform(X)
    pca_xy = decomposition.PCA(n_components=n_components).fit_transform(X)
    mds_xy = manifold.MDS( n_components = n_components, max_iter = 300, n_init = 8, n_jobs = -1 ).fit_transform( X )
    fig = p.figure()
    # a method that failed above has no result and therefore no panel
    panels = [(1, iso_xy), (2, lle_xy), (3, hessian_xy), (4, mds_xy), (5, pca_xy)]
    for position, xy in panels:
        if xy is None:
            continue
        ax = fig.add_subplot(2, 3, position)
        ax.scatter(xy[:,0], xy[:,1], c=color)
    p.show()
def mds_from_fasta(filename):
    """Load a fasta file, compute the RSCU values per gene and embed them with MDS.

    Parameters
    ----------
    filename : str
        Path of the fasta file.

    Returns
    -------
    Y : array
        The embedding returned by mds().  It used to be computed and then
        discarded (the function returned None).
    """
    fasta = load_fasta(filename)
    codon_dic,aa_dic = make_codon_histogram_dic(fasta)
    rscu = calculate_RSCU_dic(codon_dic)
    Y = mds(rscu)
    return Y
def setup_parser():
    """Set up the command-line parser, parse ``sys.argv`` and load the config file if given.

    Returns
    -------
    config : dict or None
        The parsed JSON configuration when ``-c/--configfile`` names a file
        whose name contains "jsn"; None when no config file was given.

    Raises
    ------
    Exception
        When a config file is given whose name does not contain "jsn".
    KeyError
        When the config file lacks one of the keys
        "experimental_data_filenames", "fitness_functions_filenames" or
        "fitness_matrix_names".

    Notes
    -----
    NOTE(review): the previous docstring said "returns args", but the code
    has always returned ``config``; the values copied onto the local ``args``
    namespace are not visible to the caller.  Confirm which one callers need.
    """
    usage = """%(prog)s <functional argument> <output target argument>"""
    description = """Codon Bias Simulation"""
    parser = argparse.ArgumentParser( usage = usage, description = description )
    parser.add_argument( '-f', '--gene', action = 'store', nargs = 1, dest = 'experimental_data_filenames', help = 'File which contains Codon sequence of Genes which are to be used for calculating the amino acid frequencies' )
    parser.add_argument( '-v', '--verbose', action = 'store_true', dest = 'verbose', default = False )
    parser.add_argument( '-s', '--fitness_matrix', action = 'store', nargs = 1, dest = 'fitness_matrix_names', help = 'Fitness matrix, e.g. amino-acid similarity' )
    parser.add_argument( '-i', '--fitness_functions', action = 'append', dest = 'fitness_functions_filenames', help = 'Fitness functions' )
    parser.add_argument( '-c', '--configfile', action = 'store', dest = 'config_filename', help = 'Configuration file' )
    #sys.argv.append( config )
    args = parser.parse_args()
    config = None
    if args.config_filename is not None:
        if "jsn" in args.config_filename:
            # The file object is the context manager, not the parsed dict
            # (``with json.load(...) as config`` failed for every file).
            with open( args.config_filename ) as config_file:
                config = json.load( config_file )
            args.experimental_data_filenames = config["experimental_data_filenames"]
            args.fitness_functions_filenames = config["fitness_functions_filenames"]
            args.fitness_matrix_names = config["fitness_matrix_names"]
        else:
            print('other config file formats not supported, use *.jsn ending please')
            raise Exception('other config file formats not supported, use *.jsn ending please')
    return config
def config_from_file(filename):
    """Load a JSON configuration file, best effort.

    Parameters
    ----------
    filename: path of the JSON file

    Returns
    -------
    The decoded JSON content, or ``None`` if the file could not be opened or
    parsed (the error is printed, not raised).
    """
    config = None
    try:
        # ``with`` guarantees the handle is closed even if parsing fails.
        with open(filename) as config_file:
            config = json.load(config_file)
    except (IOError, OSError, ValueError, TypeError) as e:
        print(e)
    return config
def c_codon_mut_dist(a, b):
    """Count unchanged positions, transitions and transversions between two codons.

    The counting itself is done by a small C snippet compiled through
    ``scipy.weave``; ``py_codon_mut_dist`` is the pure Python counterpart.

    Parameters
    ----------
    a: string with codon 1
    b: string with codon 2

    Returns
    -------
    List ``results`` of three integers: ``results[0]`` is the number of
    unchanged nucleotides, ``results[1]`` the number of transitions and
    ``results[2]`` the number of transversions.

    Example
    -------
    Given twice the same codon the returned distance is 3 'no-changes'

    >>> c_codon_mut_dist('aaa','aaa')
    [3, 0, 0]

    Same for 3 transversion

    >>> c_codon_mut_dist('aaa','ttt')
    [0, 0, 3]

    And for one of each

    >>> c_codon_mut_dist('aaa','atg')
    [1, 1, 1]
    """
    # Filled in place by the C code; weave looks up 'results', 'a' and 'b'
    # by name in this function's local scope, so these names must not change.
    results = [0, 0, 0]
    # Helper made available to the inline snippet: 1 for a<->g and c<->t.
    support_code = r"""
int transition(char i,char j) {
if( ((i=='a') && (j=='g') ) || ((i=='g') && (j=='a') ) ) return 1;
if( ((i=='c') && (j=='t') ) || ((i=='t') && (j=='c') ) ) return 1;
return 0;
}
"""
    main_code = """
int ti = 0;
int tv = 0;
for(int ci=0; ci<3;ci++) {
if (a[ci] == b[ci]) continue;
if (transition((a[ci]),(b[ci]))==1) {
ti += 1;
continue;
} else {
(tv += 1);
}
}
results[0] = 3-ti-tv;
results[1] = ti;
results[2] = tv;
"""
    arg_names = ['results', 'a', 'b']
    weave.inline(
        main_code,
        arg_names,
        headers=['<iostream>'],
        compiler='gcc',
        support_code=support_code,
        extra_compile_args=['-O3'],
    )
    return results
def transition(i, j):
    """Return True if changing nucleotide ``i`` into ``j`` is a transition.

    The pairs treated as transitions are a<->g and c<->t; every other pair,
    including identical nucleotides, yields False.
    """
    transition_pairs = (('a', 'g'), ('g', 'a'), ('c', 't'), ('t', 'c'))
    return (i, j) in transition_pairs
def py_codon_mut_dist( i, j ):
dist = [ 0, 0, 0 ]
for k in range( len( i ) ):
if i[k] == j[k]:
dist[0] += 1
if i[k] != j[k]:
if transition( i[k], j[k] ):
dist[1] += 1
else:
dist[2] += 1
return dist
# Implementation used by make_mutationmatrix: the scipy.weave (C) version.
# py_codon_mut_dist takes the same arguments and returns the same kind of list.
codon_mut_dist = c_codon_mut_dist
class Fitnessfunction:
    """Container for one fitness function and its current selection strength.

    Attributes set by the constructor: ``description``, ``parameter``,
    ``fitnessvalues`` (the values as a numpy array), ``interpolation``,
    ``filename``, ``strength`` (starts at 0.0) and ``max`` (largest entry of
    ``fitnessvalues``).
    """

    def __init__(self, description, parameter, values, interpolation, filename=''):
        fitnessvalues = np.array(values)
        # Descriptive fields, stored as given.
        self.description = description
        self.parameter = parameter
        self.interpolation = interpolation
        self.filename = filename
        # Numeric state.
        self.fitnessvalues = fitnessvalues
        self.max = max(fitnessvalues)
        self.strength = 0.0
def euclidean_distance(steady_state, codon_hist):
    """Return the Euclidean norm of ``steady_state - codon_hist``."""
    difference = np.array(steady_state) - np.array(codon_hist)
    return np.linalg.norm(difference)
def kl_distance(steady_state, codon_hist):
    """Return the Kullback-Leibler divergence of ``steady_state`` from ``codon_hist``.

    Computes ``sum(p * log(p / q))`` with ``p = steady_state`` and
    ``q = codon_hist``.  The previous implementation omitted the logarithm
    (so two identical distributions gave 1 instead of 0) and used integer
    division under Python 2 when both inputs were integer arrays.

    Entries with ``p == 0`` contribute nothing (the usual ``0 * log 0 = 0``
    convention).  If ``q`` is zero where ``p`` is positive the result is
    ``inf``.

    Parameters
    ----------
    steady_state: sequence of model frequencies ``p``
    codon_hist: sequence of reference frequencies ``q`` of the same length
    """
    p = np.asarray(steady_state, dtype=float)
    q = np.asarray(codon_hist, dtype=float)
    support = p > 0
    return np.sum(p[support] * np.log(p[support] / q[support]))
def relative_euclidean_distance(steady_state, codon_hist):
    """Euclidean distance between the reduced relative frequencies of both inputs.

    Both inputs are passed through ``calculate_RF`` and then through
    ``remove_stopcodons_and_one_codon_amino_acids`` before the norm of their
    difference is taken.
    """
    model_rf = calculate_RF(steady_state)
    observed_rf = calculate_RF(codon_hist)
    observed_rf = remove_stopcodons_and_one_codon_amino_acids(observed_rf)
    model_rf = remove_stopcodons_and_one_codon_amino_acids(model_rf)
    difference = np.array(model_rf) - np.array(observed_rf)
    return np.linalg.norm(difference)
def relative_cosine_distance(steady_state, codon_hist):
    """Cosine distance between the reduced relative frequencies of both inputs.

    Both inputs are passed through ``calculate_RF`` and then through
    ``remove_stopcodons_and_one_codon_amino_acids``; the result is
    ``scipy.spatial.distance.cosine`` of the two reduced vectors.
    """
    model_rf = calculate_RF(steady_state)
    observed_rf = calculate_RF(codon_hist)
    observed_rf = remove_stopcodons_and_one_codon_amino_acids(observed_rf)
    model_rf = remove_stopcodons_and_one_codon_amino_acids(model_rf)
    model_vector = np.array(model_rf)
    observed_vector = np.array(observed_rf)
    return scipy.spatial.distance.cosine(model_vector, observed_vector)
def relative_jeffrey_divergence(steady_state, codon_hist):
    """Placeholder (TODO: not implemented).

    Ignores both arguments, prints a notice and returns 0.
    """
    placeholder_distance = 0
    print('not implemented!')
    return placeholder_distance
def relative_minkowski_distance(steady_state, codon_hist):
    """Placeholder (TODO: not implemented).

    Ignores both arguments, prints a notice and returns 0.
    """
    placeholder_distance = 0
    print('not implemented!')
    return placeholder_distance
def relative_hellinger_distance(steady_state, codon_hist):
    """Placeholder (TODO: not implemented).

    Ignores both arguments, prints a notice and returns 0.
    """
    placeholder_distance = 0
    print('not implemented!')
    return placeholder_distance
# Distance measure used by Model.run / Model.run_log and by
# parametric_run_from_config.  The alternatives are kept commented out;
# exactly one assignment should be active.
#compute_distance = euclidean_distance
#compute_distance = relative_euclidean_distance
compute_distance = relative_cosine_distance
def compute_steady_state(evolmatrix, aa_hist):
    """Accumulate, over 21 matrices, a weighted dominant eigenvector.

    For every ``A`` in ``range(21)`` the eigen-decomposition of
    ``evolmatrix[A]`` is computed, the entry with the largest eigenvalue is
    located, and the absolute value of the matching eigenvector column,
    scaled and weighted by ``aa_hist[A]``, is added to a length-64
    accumulator.  Failures while accumulating are printed and skipped.
    The accumulator is normalised to sum 1 before it is returned.

    NOTE: a second function with the same name is defined further down in
    this module; being defined later, that one replaces this definition.
    NOTE(review): the eigenvector column is divided by the sum of the *row*
    with the same index -- confirm this scaling is intended.
    """
    accumulated = np.zeros(64)
    for A in range(21):
        decomposition_result = sp.linalg.eig(evolmatrix[A], overwrite_a=True, overwrite_b=True)
        eigenvalues, eigenvectors = decomposition_result
        eigenvalue_list = list(eigenvalues)
        biggest_eigenvalue_index = eigenvalue_list.index(max(eigenvalue_list))
        # P(codon) = sum over amino acids of P(codon | amino acid) * P(amino acid)
        try:
            column = (eigenvectors.T)[biggest_eigenvalue_index]
            row_total = sum(eigenvectors[biggest_eigenvalue_index])
            accumulated += abs(np.array(column / row_total)) * aa_hist[A]
        except Exception as e:
            print('something went wront')
            print(biggest_eigenvalue_index)
            print(eigenvectors)
            print(aa_hist[A])
    return accumulated / np.sum(accumulated)
def make_jc69_mutationmatrix(mu=0,
                             alpha=0,
                             beta=0):
    """Build a 64x64 codon matrix with one diagonal and one off-diagonal value.

    The diagonal is ``1 - 3/4 * mu`` and every other entry is ``mu / 4``.
    If both ``alpha`` and ``beta`` are non-zero, ``mu`` is replaced by
    ``alpha + 2 * beta``.

    The original author flagged this function as not correct ("read
    models_of_dna_evolution on wiki"); only the crash (the matrix was never
    allocated) and the numeric identity comparison were fixed here, the
    model itself is unchanged.

    Parameters
    ----------
    mu: overall mutation rate, used unless both alpha and beta are given
    alpha: first rate entering ``mu = alpha + 2 * beta``
    beta: second rate entering ``mu = alpha + 2 * beta``

    Returns
    -------
    numpy array of shape (64, 64)
    """
    # Value comparison: ``is not 0`` tests object identity and is true for 0.0.
    if alpha != 0 and beta != 0:
        mu = alpha + 2 * beta
    eq = (1. - 3. / 4. * mu)
    neq = mu / 4.
    mutationmatrix = np.zeros((64, 64))
    for i in codons:
        for j in codons:
            mutationmatrix[codon_index[i]][codon_index[j]] = eq if i == j else neq
    return mutationmatrix
def make_mutationmatrix(
        alpha=0,
        beta=0):
    """Build the 64x64 codon mutation matrix from two rates.

    With ``mu = alpha + 2 * beta`` and ``dist = codon_mut_dist(i, j)``, the
    entry for the codon pair ``(i, j)`` is
    ``(1 - mu) ** dist[0] * alpha ** dist[1] * beta ** dist[2]``.
    Rows and columns are addressed through ``codon_index``.
    """
    stay_probability = 1 - (alpha + 2 * beta)
    matrix = np.zeros((64, 64))
    for source in codons:
        row = codon_index[source]
        for target in codons:
            counts = codon_mut_dist(source, target)
            matrix[row][codon_index[target]] = stay_probability ** counts[0] * alpha ** counts[1] * beta ** counts[2]
    return matrix
def make_evolmatrix(mutationmatrix,
                    fitnessfunctions,
                    fitnessmatrices,
                    selection,
                    additive=False
                    ):
    """Build the evolution matrices, one 64x64 matrix per amino-acid slot.

    Every slot ``A`` in ``range(21)`` starts from a copy of
    ``mutationmatrix``.  Each fitness function with exactly 64 values
    contributes the per-codon factor
    ``1 - strength * (max - fitnessvalues) / max``: the factors are
    multiplied in one after the other, or, with ``additive=True``, summed
    first and multiplied in once.  Afterwards each fitness matrix rescales
    the slot by ``1 - selection * (F[Ai][Ai] - F[Ai]) / F[Ai][Ai]`` where
    ``Ai`` is the index of the first codon listed for the slot's amino acid;
    a (numerically) zero ``F[Ai][Ai]`` zeroes the whole slot instead.

    The additive sum used to be accumulated across the slots without being
    reset, so slot ``A`` was scaled by ``A + 1`` times the intended factor;
    it is now computed once and applied unchanged to every slot.

    Parameters
    ----------
    mutationmatrix: 64x64 array-like
    fitnessfunctions: objects with ``fitnessvalues``, ``strength`` and ``max``
    fitnessmatrices: 64x64 codon fitness matrices
    selection: selection strength applied to the fitness matrices
    additive: if true, sum the fitness-function factors instead of multiplying

    Returns
    -------
    numpy array of shape (21, 64, 64)
    """
    evolmatrix = np.zeros((21, 64, 64))
    # The factors do not depend on the slot, so compute them once.
    codon_factors = [
        1. - fitnessfunction.strength *
        (fitnessfunction.max - fitnessfunction.fitnessvalues)
        / fitnessfunction.max
        for fitnessfunction in fitnessfunctions
        if len(fitnessfunction.fitnessvalues) == 64
    ]
    additive_fit = None
    if additive:
        additive_fit = np.zeros(64)
        for factor in codon_factors:
            additive_fit = additive_fit + factor
    for A in range(21):
        evolmatrix[A] = np.array(mutationmatrix)
        if additive:
            evolmatrix[A] = evolmatrix[A] * additive_fit
        else:
            for factor in codon_factors:
                evolmatrix[A] = evolmatrix[A] * factor
        for fitnessmatrix in fitnessmatrices:
            Ai = codons.index(rev_codon_table[amino_acids_unique[A]][0])
            if np.abs(fitnessmatrix[Ai][Ai]) < np.finfo(float).eps:
                # No self-fitness to normalise by: the slot is switched off.
                evolmatrix[A] = evolmatrix[A] * 0.
                continue
            evolmatrix[A] = evolmatrix[A] * (
                1. - selection *
                (fitnessmatrix[Ai][Ai] - fitnessmatrix[Ai])
                / fitnessmatrix[Ai][Ai]
            )
    return evolmatrix
def compute_steady_state(evolmatrix, aa_hist):
    """Mix the dominant eigenvectors of the 21 evolution matrices.

    For every slot ``A`` in ``range(21)`` the right eigenvector of
    ``evolmatrix[A]`` belonging to the eigenvalue with the largest real part
    is taken, made non-negative, normalised to sum 1 (i.e. turned into
    P(codon | amino acid)) and added with weight ``aa_hist[A]``:
    P(codon) = sum_A P(codon | A) * P(A).  The mixture is normalised again
    before it is returned.

    ``scipy.linalg.eig`` returns eigenvectors as *columns*.  The previous
    code picked the right column but divided it by the sum of the *row* with
    the same index, which rescaled every slot by an unrelated number; the
    vector is now normalised by its own sum.  The input matrices are also no
    longer overwritten.

    Parameters
    ----------
    evolmatrix: array of shape (21, 64, 64)
    aa_hist: sequence of 21 weights, one per slot

    Returns
    -------
    numpy array of length 64 summing to 1
    """
    steady_state = np.zeros(64)
    for A in range(21):
        eigenvalues, eigenvectors = sp.linalg.eig(evolmatrix[A])
        dominant = np.argmax(eigenvalues.real)
        # abs() removes the arbitrary sign/phase the solver may attach.
        dominant_vector = np.abs(eigenvectors[:, dominant])
        steady_state += dominant_vector / dominant_vector.sum() * aa_hist[A]
    return steady_state / steady_state.sum()
def run_model(parameters, fitnessfunctions, fitnessmatrices, aa_hist):
    """Run the model once for a flat parameter vector.

    parameters = alpha, beta, selection, t0, t1, t2, t3 ...

    ``parameters[3 + k]`` is written to ``fitnessfunctions[k].strength``;
    then the mutation matrix is built from alpha and beta, the evolution
    matrix from it and the selection strength, and the resulting steady
    state is returned.
    """
    for position, fitnessfunction in enumerate(fitnessfunctions, 3):
        fitnessfunction.strength = parameters[position]
    mutationmatrix = make_mutationmatrix(alpha=parameters[0], beta=parameters[1])
    selection = parameters[2]
    evolmatrix = make_evolmatrix(mutationmatrix, fitnessfunctions, fitnessmatrices, selection)
    return compute_steady_state(evolmatrix, aa_hist)
import time
import inspyred
import inspyred.ec
import random
import multiprocessing as mp
class Model():
    """Couples the model inputs with an observed codon histogram for fitting.

    A candidate is a flat list ``alpha, beta, selection, t0, t1, ...`` with
    one ``t`` per fitness function.
    """

    def __init__(self, fitnessfunctions, fitnessmatrices, aa_hist, c_hist):
        self.fitnessfunctions = fitnessfunctions
        self.fitnessmatrices = fitnessmatrices
        self.aa_hist = aa_hist
        self.codon_hist = c_hist

    def run(self, parameters):
        """Run the model and return its distance to the observed codon histogram.

        The steady state is kept in ``self.results`` and the distance in
        ``self.distance``.
        """
        self.results = run_model(parameters, self.fitnessfunctions, self.fitnessmatrices, self.aa_hist)
        self.distance = compute_distance(self.results, self.codon_hist)
        return self.distance

    def run_log(self, parameters):
        """Like ``run`` but on transformed parameters.

        NOTE(review): every parameter is raised to the 10th power
        (``x ** 10``), not used as an exponent (``10 ** x``) -- confirm this
        is the intended transformation.
        """
        lparameters = np.array(parameters) ** 10
        self.results = run_model(lparameters, self.fitnessfunctions, self.fitnessmatrices, self.aa_hist)
        self.distance = compute_distance(self.results, self.codon_hist)
        return self.distance

    def generate_quasispecies(self, random, args):
        """inspyred generator: one candidate drawn from expovariate(1e5)."""
        # alpha, beta, selection + one strength per fitness function
        size = 3 + len(self.fitnessfunctions)
        return [random.expovariate(1e5) for i in range(size)]

    def generate_quasispecies_old_opt(self, random, args):
        """inspyred generator: candidates scattered around ``args['old_opt']``.

        Component ``i`` is drawn from an exponential distribution with mean
        ``old_opt[i]``.  A component that is not positive (the bounder used
        by the optimisers clips to 0.0) is kept at 0.0, the limit of that
        distribution, instead of dividing by zero.  Without an old optimum
        the plain ``generate_quasispecies`` generator is used.
        """
        opt = args.get('old_opt')
        if opt is None:
            return self.generate_quasispecies(random, args)
        size = 3 + len(self.fitnessfunctions)
        return [random.expovariate(1. / opt[i]) if opt[i] > 0 else 0.0
                for i in range(size)]

    def evaluate_quasispecies(self, candidates, args):
        """inspyred evaluator: the ``run`` distance of every candidate."""
        return [self.run(candidate) for candidate in candidates]

    def optimize_evolutionary(self, pop_size=2500, max_eval=100000, mutrate=0.35):
        """Minimise the distance with an inspyred evolution strategy.

        Returns the first individual of the final population after sorting
        it in descending order.
        """
        rand = random.Random()
        rand.seed(int(time.time()))
        es = inspyred.ec.ES(rand)
        es.terminator = inspyred.ec.terminators.evaluation_termination
        final_pop = es.evolve(generator=self.generate_quasispecies,
                              evaluator=self.evaluate_quasispecies,
                              pop_size=pop_size,
                              maximize=False,
                              bounder=inspyred.ec.Bounder(0.0, 1.0),
                              max_evaluation=max_eval,
                              mutation_rate=mutrate)
        final_pop.sort(reverse=True)
        return final_pop[0]

    def optimize_pso(self, pop_size=500, max_eval=100000, neighborhood_size=5, old_opt=None):
        """Minimise the distance with inspyred particle swarm optimisation.

        ``old_opt`` is forwarded to ``generate_quasispecies_old_opt`` to seed
        the swarm.  Returns the first individual of the final population
        after sorting it in descending order.

        NOTE(review): ``neighborhood_size`` is accepted but not forwarded to
        ``evolve`` -- confirm whether it should be.
        """
        rand = random.Random()
        rand.seed(int(time.time()))
        ea = inspyred.swarm.PSO(rand)
        ea.topology = inspyred.swarm.topologies.ring_topology
        ea.terminator = inspyred.ec.terminators.evaluation_termination
        final_pop = ea.evolve(generator=self.generate_quasispecies_old_opt,
                              evaluator=self.evaluate_quasispecies,
                              pop_size=pop_size,
                              maximize=False,
                              bounder=inspyred.ec.Bounder(0.0, 1.0),
                              max_evaluation=max_eval,
                              old_opt=old_opt)
        final_pop.sort(reverse=True)
        return final_pop[0]
def init_fitnessmatrix_for_codons(config=None,
                                  filenames=None
                                  ):
    """Placeholder for loading codon-level fitness matrices.

    Both arguments are ignored and ``None`` is returned.
    """
    # TODO: not implemented
    return None
def init_fitnessmatrix_for_amino_acids(config=None,
                                       filenames=None,
                                       amino_acid_order=list("ARNDCEQGHIKLMFPSTWYV")
                                       ):
    """Load amino-acid fitness matrices and expand them to 64x64 codon matrices.

    The file names come either from ``config["fitness_matrix_names"]`` or
    from ``filenames``; ``config`` wins if both are given.  Files whose name
    contains ``csv`` are read as comma separated rows of floats.  Files whose
    name contains ``jsn`` are parsed as JSON but their content is currently
    not used.

    Each amino-acid matrix is expanded so that entry ``[i][j]`` of the codon
    matrix is the amino-acid entry for the amino acids encoded by codons
    ``i`` and ``j``.  Rows and columns of the stop codons (tag, tga, taa)
    stay zero.

    Parameters
    ----------
    config: configuration dictionary with a ``fitness_matrix_names`` entry
    filenames: list of file names, used when no config is given
    amino_acid_order: amino acids in the row/column order of the matrix files

    Returns
    -------
    List of 64x64 numpy arrays, one per csv file.

    Raises
    ------
    ValueError
        If neither ``config`` nor ``filenames`` is given (previously this
        fell through to a TypeError when iterating ``None``).

    Example
    -------
    Load the mclachlan72 fitness matrix that expresses how good an amino acid i can be represented by an amino acid j.
    Then we look at which codons have nonzero fitness to code for tct which corresponds to the amino acid S. as a good test,
    all synonymous codons of S (tct,tcg,tcc,tca,agc,agt) should have nonzero fitness!

    >>> fit = init_fitnessmatrix_for_amino_acids(filenames=['mclachan72.csv'])  # doctest: +NORMALIZE_WHITESPACE
    >>> np.array(codons)[fit[0][4,:]!=0]  # doctest: +NORMALIZE_WHITESPACE
    array(['tct', 'tcc', 'tca', 'tcg', 'tgt', 'tgc', 'caa', 'cag', 'act',
    'acc', 'aca', 'acg', 'aat', 'aac', 'agt', 'agc', 'gct', 'gcc',
    'gca', 'gcg'],
    dtype='|S3')
    """
    if config is not None and filenames is not None:
        print('two optional options given, I just use the config= ones...')
    if config is not None:
        filenames = config["fitness_matrix_names"]
    if filenames is None:
        message = 'either config = configdict or list of filenames as filenames'
        print(message)
        raise ValueError(message)
    fitnessmatrices_amino_acids = []
    for fitness_matrix_name in filenames:
        if "jsn" in fitness_matrix_name:
            with open(fitness_matrix_name) as matrix_file:
                # Parsed (so malformed files still fail loudly) but unused.
                json.load(matrix_file)
        if "csv" in fitness_matrix_name:
            with open(fitness_matrix_name) as matrix_file:
                rows = [[float(cell) for cell in line.split(',')]
                        for line in matrix_file]
            fitnessmatrices_amino_acids.append(np.array(rows))
    if not fitnessmatrices_amino_acids:
        return []
    # Codon index -> position of its amino acid in the matrix files,
    # computed once instead of once per codon pair.
    stop_codons = ('tag', 'tga', 'taa')
    coding_indices = [index for index, codon in enumerate(codons)
                      if codon not in stop_codons]
    amino_acid_positions = [amino_acid_order.index(codon_table[codons[index]])
                            for index in coding_indices]
    fitnessmatrices = []
    for fitnessmatrix_amino_acid in fitnessmatrices_amino_acids:
        fitnessmatrix = np.zeros((64, 64))
        for codon_index_i, amino_acid_i in zip(coding_indices, amino_acid_positions):
            for codon_index_j, amino_acid_j in zip(coding_indices, amino_acid_positions):
                fitnessmatrix[codon_index_i][codon_index_j] = fitnessmatrix_amino_acid[amino_acid_i][amino_acid_j]
        fitnessmatrices.append(fitnessmatrix)
    return fitnessmatrices
def init_fitnessfunctions(config=None, filenames=None):
    """Load fitness functions from JSON files.

    The file names come either from ``config["fitness_functions_filenames"]``
    or from ``filenames``; ``config`` wins if both are given.  Each file must
    provide the keys ``description``, ``parameter``, ``fitnessvalues`` and
    ``interpolation``.  Files that cannot be read are reported on stdout and
    skipped.  The name of the source file is stored on each
    ``Fitnessfunction`` as ``filename``.

    Returns
    -------
    List of ``Fitnessfunction`` objects (possibly empty).

    Raises
    ------
    ValueError
        If neither ``config`` nor ``filenames`` is given (previously this
        fell through to a TypeError when iterating ``None``).

    Example
    -------
    Given an example file, `trna_pool.jsn` in which a measure for the trna pool of e.coli is given
    a list with fitnessfunctions of length 1 and a fitnessfunction with the right fitnessvalues should
    be loaded

    >>> a = init_fitnessfunctions(filenames=['trna_pool.jsn'])  # doctest: +NORMALIZE_WHITESPACE
    >>> a[0].fitnessvalues
    array([ 3.27, 3.27, 3.57, 9.61, 6.5 , 2.41, 4.09, 5.18,
    6.41, 6.41, 1. , 1. , 5.01, 5.01, 1. , 2.98,
    2.97, 2.97, 2.1 , 16.21, 4.1 , 2.27, 1.83, 4.67,
    2.02, 2.02, 2.41, 2.78, 15. , 15. , 15. , 2.01,
    10.96, 10.96, 10.96, 8.31, 6.67, 3.78, 2.89, 4.6 ,
    3.77, 3.77, 6.08, 6.08, 4.44, 4.44, 2.74, 1.23,
    16.11, 3.99, 12.12, 12.12, 10.25, 1.95, 10.25, 10.25,
    7.56, 7.56, 14.88, 14.88, 13.76, 13.76, 6.75, 6.75])
    """
    fitnessfunctions = []
    if config is not None and filenames is not None:
        print('two optional options given, I just use the config= ones...')
    if config is not None:
        filenames = config["fitness_functions_filenames"]
    if filenames is None:
        message = 'either config = configdict or list of filenames as filenames'
        print(message)
        raise ValueError(message)
    for fitness_function_filename in filenames:
        try:
            with open(fitness_function_filename) as fitness_file:
                fitnessdata = json.load(fitness_file)
            fitnessfunctions.append(
                Fitnessfunction(fitnessdata["description"],
                                fitnessdata["parameter"],
                                fitnessdata["fitnessvalues"],
                                fitnessdata["interpolation"],
                                filename=fitness_function_filename))
        except Exception as e:
            # Deliberately best effort: report the file and carry on.
            print(' '.join(['reading filtertnessfunction ', str(fitness_function_filename), ' failed']))
            print(e)
    if not fitnessfunctions:
        print("no fitnessfunctions read!")
    return fitnessfunctions
from scipy.optimize import minimize
def optimize(model, method=None):
    """Minimise ``model.run`` with ``scipy.optimize.minimize``.

    Parameters
    ----------
    model: object with a ``run(parameters)`` method returning the distance
    method: solver name passed on to ``minimize``; ``None`` selects
        ``'powell'``.  (The argument used to be ignored.)

    Returns
    -------
    The result object returned by ``scipy.optimize.minimize``.

    The start vector is ``alpha = beta = 1.0`` followed by ``1e-6`` for the
    selection and for every fitness-function strength.  At least three
    strengths are always included, as before; models with more fitness
    functions get one entry per function instead of an IndexError.
    """
    if method is None:
        method = 'powell'
    n_strengths = max(3, len(getattr(model, 'fitnessfunctions', ())))
    # alpha, beta, selection, t0, t1, t2, ...
    x0 = [1.0, 1.0, 1e-6] + [1e-6] * n_strengths
    return minimize(model.run, x0, method=method)
def sample_run():
    """Smoke test: run the model once on the bundled E. coli data.

    Loads the FASTA file, the McLachlan fitness matrix and the tRNA pool
    fitness function, runs the model with all parameters set to 1.0 on the
    first amino-acid histogram, prints the steady state and reports whether
    its first entry is within 1e-5 of 0.0114015.
    """
    fasta = load_fasta('Escherichia_coli_O157-H7_EDL933Refseq.ffn')
    chist, ahist = make_codon_histogram_dic(fasta)
    fitmat = init_fitnessmatrix_for_amino_acids(filenames=['mclachan72.csv'])
    fitfu = init_fitnessfunctions(filenames=['trna_pool.jsn'])
    first_aa_hist = next(iter(ahist.values()))
    result = run_model([1.0, 1.0, 1.0, 1.0], fitfu, fitmat, first_aa_hist)
    print(result)
    deviation = np.abs(result[0] - 0.0114015)
    verdict = 'seems okay!' if deviation < 1e-5 else 'something is wrong?'
    print(verdict)
def _float_arange_from_config(range_config):
    """Return np.arange over the float-converted start/end/step of a config entry."""
    return np.arange(float(range_config["start"]),
                     float(range_config["end"]),
                     float(range_config["step"]))


def parametric_run_from_config(config, toolbar=False, output=True):
    """Scan selection and fitness-function strengths as described by a config file.

    For every combination of selection strength and fitness-function
    strengths the steady state and its distance to the codon histogram of
    the first record of the first experimental data file are computed.  The
    best combination (smallest distance) is printed as
    ``<name> <selection> <strengths...> <distance>``.

    Parameters
    ----------
    config: file name of the JSON configuration (read with config_from_file)
    toolbar: if true, draw a simple progress bar on stdout
    output: if true, write all results to ``results.jsn``.  The flag used to
        be overwritten by the internal result list, so the file was always
        written; the default keeps that behaviour.

    Returns
    -------
    None
    """
    config = config_from_file(config)
    print(config['experimental_data_filenames'])
    f = load_fasta(config['experimental_data_filenames'][0])
    codon_hist, aa_hist = make_codon_histogram_dic(f)
    codon_hist = list(codon_hist.values())[0]
    aa_hist = list(aa_hist.values())[0]

    # First entry of the result list describes the input data.
    results = [{
        "time": time.time(),
        "aa_hist": aa_hist,
        "aa_hist_aas": list(rev_codon_table.keys()),
        "codon_hist": codon_hist,
    }]

    fitnessmatrices = init_fitnessmatrix_for_amino_acids(config=config)
    fitnessfunctions = init_fitnessfunctions(config=config)
    selection_config = config["run"]["fitnessmatrix"]["selection"]

    if toolbar:
        try:
            toolbar_width = len(_float_arange_from_config(selection_config))
            sys.stdout.write("[%s]" % (" " * toolbar_width))
            sys.stdout.flush()
            # return to start of line, after '['
            sys.stdout.write("\b" * (toolbar_width + 1))
        except Exception:
            print("autsch")

    selection_range = []
    if selection_config["scale"] == "linear":
        selection_range = _float_arange_from_config(selection_config)
    if selection_config["scale"] == "log":
        selection_range = [10 ** (-x) for x in _float_arange_from_config(selection_config)]

    # Cartesian product of the strength ranges, one range per fitness function.
    strength_axes = [
        np.arange(config["run"]["fitnessfunctions"][fitnessfunction_index]["start"],
                  config["run"]["fitnessfunctions"][fitnessfunction_index]["end"],
                  config["run"]["fitnessfunctions"][fitnessfunction_index]["step"])
        for fitnessfunction_index in range(len(fitnessfunctions))
    ]
    fitnessfunction_strengths = list(dict.fromkeys(itertools.product(*strength_axes)))

    # The mutation matrix does not depend on the scanned parameters.
    alpha = config["run"]["fitnessmatrix"]["alpha"]
    beta = config["run"]["fitnessmatrix"]["beta"]
    mu = alpha + 2 * beta
    mutationmatrix = make_mutationmatrix(alpha=alpha, beta=beta)

    plotdata = []
    for selection in selection_range:
        if toolbar:
            sys.stdout.write("-")
            sys.stdout.flush()
        for fitnessfunction_strength in fitnessfunction_strengths:
            for fitnessfunction, strength in zip(fitnessfunctions, fitnessfunction_strength):
                if fitnessfunction.interpolation == "linear":
                    fitnessfunction.strength = strength
                if fitnessfunction.interpolation == "log":
                    fitnessfunction.strength = 10 ** (-strength)
            evolmatrix = make_evolmatrix(mutationmatrix, fitnessfunctions, fitnessmatrices, selection)
            steady_state = compute_steady_state(evolmatrix, aa_hist)
            codon_distance = compute_distance(steady_state, codon_hist)
            # [index, strength] pairs as plain floats so json can serialise them.
            fitness = [[float(fitnessfunction_i), float(fitnessfunction.strength)]
                       for fitnessfunction_i, fitnessfunction in enumerate(fitnessfunctions)]
            results.append({
                "selection": selection,
                "fitness": fitness,
                "steady_state": [str(value) for value in steady_state],
                "codon_distance": codon_distance,
                "mu_alpha_beta": [mu, alpha, beta],
            })
            plotdata.append([selection] + [pair[1] for pair in fitness] + [codon_distance])

    if plotdata:
        best = min(plotdata, key=lambda row: row[-1])
        print(' '.join([str(config["name"])] + [str(x) for x in best]))

    if output:
        with open("results.jsn", "w+") as of:
            json.dump(results, of, indent=2)
def optimize_sequence(steady_state, target_sequence):
    """Placeholder: optimise ``target_sequence`` for a given steady state.

    Nothing is implemented yet; both arguments are ignored and ``None`` is
    returned.  Steps noted for the implementation:

    - translate target_sequence
    - make chist and ahist from the target sequence
    - calculate the relative frequency of the steady state
    - for the codons in target_sequence: (left open in the original notes)
    """
    return None
def compute_optimum_two_step(f, fitfu, fitmat, gene, id):
    """Fit the model to one gene: evolution strategy first, then PSO.

    The optimum found by ``Model.optimize_evolutionary`` seeds
    ``Model.optimize_pso``.  The final parameters are printed together with
    the gene identifiers and the achieved fitness, and appended to ``f`` as
    one tab separated line starting with ``id``.

    Parameters
    ----------
    f: writable file-like object (needs ``write`` and ``flush``)
    fitfu: list of fitness functions
    fitmat: list of fitness matrices
    gene: record handed to ``make_codon_histogram``; its ``id`` is printed
    id: identifier of the gene, converted with ``str`` before writing.
        Writing an integer id used to raise a TypeError and no separator
        followed the id.

    Returns
    -------
    The optimal parameter list ``alpha, beta, selection, t0, ...``.
    """
    chist, ahist = make_codon_histogram(gene)
    mymod = Model(fitfu, fitmat, ahist, chist)
    n_parameters = len(fitfu) + 3
    ss = mymod.optimize_evolutionary()
    opt = ss.candidate[:n_parameters]
    ss = mymod.optimize_pso(old_opt=opt)
    opt = ss.candidate[:n_parameters]
    # Leaves the model and the fitness functions configured at the optimum.
    mymod.run(opt)
    result = opt
    # One print / one write per gene keeps lines intact when several
    # workers report at the same time.
    values = ' '.join(str(value) for value in [ss.fitness] + list(result))
    print('\t'.join([str(id), str(gene.id), values]))
    f.write(str(id) + '\t' + ''.join(str(point) + '\t' for point in result) + '\n')
    f.flush()
    return result
def compute_optimum(f, fitfu, fitmat, gene, id):
    """Fit the model to one gene with the evolution strategy only.

    The optimal parameters are printed after ``id`` and appended to ``f`` as
    one tab separated line starting with ``id``.

    Parameters
    ----------
    f: writable file-like object (needs ``write`` and ``flush``)
    fitfu: list of fitness functions
    fitmat: list of fitness matrices
    gene: record handed to ``make_codon_histogram``
    id: identifier of the gene, converted with ``str`` before writing.
        Writing an integer id used to raise a TypeError and no separator
        followed the id.

    Returns
    -------
    The optimal parameter list ``alpha, beta, selection, t0, ...``.
    """
    chist, ahist = make_codon_histogram(gene)
    mymod = Model(fitfu, fitmat, ahist, chist)
    opt = mymod.optimize_evolutionary().candidate[:len(fitfu) + 3]
    # Leaves the model and the fitness functions configured at the optimum.
    mymod.run(opt)
    result = opt
    print(' '.join([str(id)] + [str(point) for point in result]))
    f.write(str(id) + '\t' + ''.join(str(point) + '\t' for point in result) + '\n')
    f.flush()
    return result
def init_list_of_genes(fasta_filename, minimal_number_of_nucleotides_per_gene):
    """Return the records of a FASTA file that are longer than a minimum length.

    Parameters
    ----------
    fasta_filename: file handed to ``load_fasta``
    minimal_number_of_nucleotides_per_gene: records whose sequence is not
        strictly longer than this are dropped

    Returns
    -------
    List of the records that passed the length filter, in file order.
    """
    # len(gene.seq) equals len(gene.seq.tostring()) without relying on the
    # tostring() method, which newer Biopython releases no longer provide.
    return [gene for gene in load_fasta(fasta_filename)
            if len(gene.seq) > minimal_number_of_nucleotides_per_gene]
class _LineCollector(object):
    """Write-only stand-in for a file that keeps the written text in memory."""

    def __init__(self):
        self._chunks = []

    def write(self, chunk):
        self._chunks.append(str(chunk))

    def flush(self):
        pass

    def getvalue(self):
        return ''.join(self._chunks)


def _two_step_optimum_text(fitfu, fitmat, gene, gene_index):
    """Worker entry point: run compute_optimum_two_step and return what it wrote."""
    collector = _LineCollector()
    compute_optimum_two_step(collector, fitfu, fitmat, gene, gene_index)
    return collector.getvalue()


def optimize_dic(fasta_filename, fitfu, fitmat, minimal_number_of_nucleotides_per_gene=500, n_processes=12):
    r"""Fit the model to every sufficiently long gene of a FASTA file in parallel.

    Each gene is optimised with ``compute_optimum_two_step`` in a worker
    process; the result lines are collected in the parent process and
    written to the file ``optresuls``.  Open file objects cannot be sent to
    worker processes, so the workers return their text instead of writing to
    a shared handle.  A gene whose optimisation fails is reported on stdout
    and skipped.

    Parameters
    ----------
    fasta_filename: FASTA file with the genes
    fitfu: list of fitness functions
    fitmat: list of fitness matrices
    minimal_number_of_nucleotides_per_gene: shorter genes are skipped
    n_processes: number of worker processes (default 12, as before)

    Notes
    -----
    Why a minimal gene length?  The probability when we put r codons of a
    gene into n bins of different codons to find a codon with k occurrences
    in the gene is

    .. math::

        p_k = \binom{r}{k} \frac{(n-1)^{r-k}}{n^r}

    hence, for n = 64, the probability of finding at least one occurrence is

    .. math::

        p_{k \geq 1} = (63/64)^r \left( (64/63)^r - 1 \right) = 1 - (63/64)^r

    If we want to be reasonably sure our gene contains at least one codon of
    every kind we have to solve the inequality p_{k >= 1} > p-value.
    A p-value of 0.95 implies we have to use a gene with at least 500
    nucleotides.
    """
    list_of_genes = init_list_of_genes(fasta_filename, minimal_number_of_nucleotides_per_gene)
    pool = mp.Pool(n_processes)
    try:
        # enumerate gives each gene its position without an O(n) list.index
        # lookup (which also depends on record equality).
        jobs = [(gene_index, pool.apply_async(_two_step_optimum_text, (fitfu, fitmat, gene, gene_index)))
                for gene_index, gene in enumerate(list_of_genes)]
        pool.close()
        with open('optresuls', 'w+') as f:
            for gene_index, job in jobs:
                try:
                    f.write(job.get())
                    f.flush()
                except Exception as e:
                    print('optimisation of gene ' + str(gene_index) + ' failed: ' + str(e))
        pool.join()
    finally:
        pool.terminate()
    print('done')
if __name__ == "__main__":
    # Executed as a script: run the doctests embedded in the docstrings of this module.
    doctest.testmod()
    #mygui = cobigui()
    #mygui.start_gui()