import glob
import os
import re
import requests
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq
from Bio.SeqIO import SeqRecord
from Bio import SeqIO
from .utils import is_fasta
[docs]class PrimerDesigner:
"""Class for designing primers from FASTA files.
It will send a FASTA alignment to `primers4clades`_ in order to design
degenerate primers. Input data needed is an alignment in FASTA format
containing at least 4 sequences.
It is recommended that the beginning of each FASTA sequence description
contains the taxon name between square brackets.
Parameters:
folder (str): path of folder containing the FASTA file alignments
taxon_for_codon_usage (str): optional taxon name that will be inserted in the
description of FASTA sequences between square
brackets so that can be used by primer4clades
to infer the codon table to use
tm (str): temperature
min_amplength (str): minimum amplicon length
max_amplength (str): maximum amplicon length
gencode (str): genetic code. See below for all available genetic
codes
clustype (str): cluster distance metric: ``dna``, ``protein``.
amptype (str): substitution model used to estimate phylogenetic
information
email (str): your email address so that primer4clades can send
you email with detailed results
Example:
>>> # The values shown are the default. Change them if needed.
>>> from primer_designer import PrimerDesigner
>>> pd = PrimerDesigner()
>>> pd.folder = "alignments" # folder containing the FASTA file alignments
>>> pd.tm = "55" # annealing temperature
>>> pd.min_amplength = "250" # minimum amplicon length
>>> pd.max_amplength = "500" # maximum amplicon length
>>> pd.gencode = "universal" # see below for all available genetic codes
>>> pd.mode = "primers"
>>> pd.clustype = "dna"
>>> pd.amptype = "dna_GTRG" # substitution model used to estimate phylogenetic information
>>> pd.email = "youremail@email.com" # primer4clades will send you an email with very detailed results
>>> pd.design_primers()
The best primer pairs will be printed to your screen. Detailed results will
be saved as HTML files in your alignments folder. But it is recommended if
you also get the results by email. primers4clades_ will send you one email
for each alignment.
The genetic code table (variable ``gencode``) can be any of the following:
* ``universal`` for standard
* ``2`` for vertebrate mitochondrial
* ``3`` for yeast mitochondrial
* ``4`` for mold and protozoa mitochondrial
* ``5`` for invertebrate mitochondrial
* ``6`` for ciliate
* ``9`` for echinoderm and flatworm
* ``10`` for euplotid nuclear
* ``11`` for bacterial and plastid
* ``12`` for alternative yeast nuclear
* ``13`` for ascidian mitochondrial
* ``14`` for flatworm mitochondrial
* ``15`` for Blepharisma nuclear
* ``16`` for Chlorophycean mitochondrial
* ``21`` for Trematode mitochondrial
* ``22`` for Scenedesmus obliquus mitochondrial
* ``23`` for Thraustochytrium mitochondrial
The evolutionary substitution model can be any of the following (variable
``amptype``):
* ``protein_WAGG`` for protein WAG+G
* ``protein_JTTG`` for protein JTT+G
* ``protein_Blosum62G`` for protein Blosum62+G
* ``protein_VTG`` for protein VT+G
* ``protein_DayhoffG`` for protein Dayhoff+G
* ``protein_MtREVG`` for protein MtREV+G
* ``dna_HKYG`` for dna HKY+G
* ``dna_GTRG`` for dna GTR+G
* ``dna_K80G`` for dna K80+G
* ``dna_TrNG`` for dna TrN+G
* ``dna_JC69G`` for dna JC69+G
.. _primers4clades: http://floresta.eead.csic.es/primers4clades/#0
"""
def __init__(self, folder=None, taxon_for_codon_usage=None, tm="55",
min_amplength="100", max_amplength="500", gencode="universal",
mode="primers", clustype="dna", amptype="dna_GTR", email=None):
self.folder = folder
self.taxon_for_codon_usage = taxon_for_codon_usage
self.tm = tm
self.min_amplength = min_amplength
self.max_amplength = max_amplength
self.gencode = gencode
self.mode = mode
self.clustype = clustype
self.amptype = amptype
self.email = email
self.designed_primers = []
[docs] def design_primers(self):
if os.path.exists(self.folder):
all_files = os.path.join(self.folder, "*")
alns = glob.glob(all_files)
if alns:
for aln in alns:
if is_fasta(aln):
if self.taxon_for_codon_usage:
aln = self.insert_taxon_in_new_fasta_file(aln)
print("\nProcessing file \"{0}\"".format(aln))
r = self.request_primers(aln)
self.process_response(aln, r.text)
# Write primers to alignment file
SeqIO.write(self.designed_primers, "primers.fasta", "fasta")
print("\nDone.\nAll primers have been saved in the file \"primers.fasta\"")
return self.designed_primers
else:
print("\nError! the folder {0} is empty.\n".format(self.folder))
else:
print("\nError! the folder {0} does not exist.\n".format(self.folder))
[docs] def insert_taxon_in_new_fasta_file(self, aln):
"""primer4clades infers the codon usage table from the taxon names in the
sequences.
These names need to be enclosed by square brackets and be
present in the description of the FASTA sequence. The position is not
important. I will insert the names in the description in a new FASTA
file.
Returns:
Filename of modified FASTA file that includes the name of the taxon.
"""
new_seq_records = []
for seq_record in SeqIO.parse(aln, 'fasta'):
new_seq_record_id = "[{0}] {1}".format(self.taxon_for_codon_usage, seq_record.id)
new_seq_record = SeqRecord(seq_record.seq, id=new_seq_record_id)
new_seq_records.append(new_seq_record)
base_filename = os.path.splitext(aln)
new_filename = '{0}_modified{1}'.format(base_filename[0], base_filename[1])
SeqIO.write(new_seq_records, new_filename, "fasta")
return new_filename
[docs] def process_response(self, aln, response_body):
this_file = os.path.split(aln)[1]
this_file = re.sub(".fas.*", "", this_file)
msg = 'Writing detailed results as file "{0}.html"'.format(aln)
print(msg)
with open("{0}.html".format(aln), "w") as handle:
handle.write(response_body)
# Show primer pair to user
html_file = response_body.split("\n")
i = 1
while i < 4:
for line in html_file:
if "degen_corr" in line:
seq = line.split(" ")[0].strip()
description = line.split(" ")[2].strip()
this_id = this_file + "_" + line.split(" ")[1].strip()
this_id += "_" + str(i)
seq = Seq(seq, IUPAC.ambiguous_dna)
seq_record = SeqRecord(seq)
seq_record.id = this_id
seq_record.description = description
self.designed_primers.append(seq_record)
i += 1
[docs] def request_primers(self, aln):
url = "http://floresta.eead.csic.es/primers4clades/primers4clades.cgi"
params = {
'tm': self.tm,
'min_amplength': self.min_amplength,
'max_amplength': self.max_amplength,
'mode': self.mode,
'gencode': self.gencode,
'clustype': self.clustype,
'email': self.email,
}
files = {'sequencefile': open(aln, 'rb')}
r = requests.post(url, files=files, data=params)
return r