Source code for pydna.assembly

#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''Provides functions for assembly of sequences by homologous recombination.
Given a list of sequences (Dseqrecords), all sequences will be analyzed for
overlapping regions of DNA (common substrings).

The assembly algorithm is based on forming a network where each
overlapping sequence forms a node and intervening sequences form edges.

All possible linear or circular assembly products are then returned,
sorted by length (longest first).


'''

import networkx as nx
import sys
import operator
import Bio.SeqIO
import itertools
import copy

from Bio.Seq                 import Seq
from Bio.Seq                 import reverse_complement as rc
from Bio.SeqUtils.CheckSum   import seguid

from findsubstrings_suffix_arrays_python        import common_sub_strings
#from find_sub_strings         import common_sub_strings
from Bio.Alphabet.IUPAC      import ambiguous_dna
from Bio.SeqFeature          import SeqFeature, FeatureLocation, ExactPosition
from pydna._simple_paths7    import all_circular_paths_edges
from pydna._simple_paths8    import all_simple_paths_edges
from pydna.dsdna             import Dseqrecord


def circular_assembly(form_rec_list, limit=25):
    '''Assemble Dseqrecords into circular products by homologous recombination.

    The records are joined through shared regions of homology with a
    minimum length given by limit.

    Parameters
    ----------
    form_rec_list : list
        a list of Dseqrecord objects.
    limit : int, optional
        minimum length of a region of homology, 25 by default.

    Returns
    -------
    frecs, cp : tuple
        frecs are the records returned by ``_make_graph`` (copies of the
        given Dseqrecords and their reverse complements) with the regions
        of homology added to the features.
        cp is a list of Dseqrecords representing the circular products
        sorted by length (long -> short). Products that are rotations or
        reverse complements of an earlier product are left out.
    '''

    def relocate(marker, begin, end, length, helper_key):
        # Rebuild the feature announced by a zero-width marker. The internal
        # helper qualifier is never copied to the result.
        qualifiers = dict((k, v) for k, v in marker.qualifiers.items()
                          if k != helper_key)

        def build(start, stop, location_operator, sub_features):
            return SeqFeature(FeatureLocation(start, stop),
                              type=marker.type,
                              location_operator=location_operator,
                              strand=marker.strand,
                              id=marker.id,
                              qualifiers=qualifiers,
                              sub_features=sub_features)

        if begin < 0:
            # The feature starts before the origin: express it relative to
            # the previous turn so both wrap-around cases look the same.
            begin, end = begin + length, end + length
        if end <= length:
            return build(begin, end, marker.location_operator, None)
        # The feature spans the origin: one part up to the end of the
        # sequence and one part from position zero.
        a = build(begin, length, marker.location_operator, None)
        b = build(0, end - length, marker.location_operator, None)
        return build(a.location.start, b.location.end, "join", [a, b])

    frecs, G = _make_graph(form_rec_list, limit)
    # The artificial end nodes can not be part of a circular product.
    G.remove_nodes_from(('5', '3'))

    # Every cycle is collapsed into one new node per rotation so that the
    # path search below can pass through the cycle as a single unit.
    # NOTE(review): the first cycle reported by networkx is skipped here,
    # while linear_assembly processes all of them -- confirm this is intended.
    for cycle in nx.simple_cycles(G)[1:]:
        circular_paths = [(cycle + cycle[1:])[n:len(cycle) + n]
                          for n in range(len(cycle) - 1)]
        for circular_path in circular_paths:
            keynode = circular_path[0]
            # collect edges around cycle
            edge_seqs = [G[u][v][0]['sek']
                         for u, v in zip(circular_path, circular_path[1:])]
            # collect nodes around cycle
            node_seqs = [G.node[node]['sek'] for node in circular_path]
            # node, edge, node, ... , node (the trailing fill value is cut)
            interleaved = list(itertools.chain.from_iterable(
                itertools.izip_longest(node_seqs, edge_seqs)))[:-1]
            merged = reduce(operator.add, interleaved)
            incoming = [(n, keynode) for n in G.predecessors(keynode)
                        if n not in circular_path]
            outgoing = [(keynode, n) for n in G.successors(keynode)
                        if n not in circular_path]
            new_node = seguid(merged.seq)
            G.add_node(new_node, sek=merged)
            G.add_edges_from([(p[0], new_node, {'sek': G[p[0]][p[1]][0]['sek']})
                              for p in incoming])
            G.add_edges_from([(new_node, p[1], {'sek': G[p[0]][p[1]][0]['sek']})
                              for p in outgoing])

    circular_products = []
    for path in all_circular_paths_edges(G):
        result = Dseqrecord("")
        for first_node, second_node, edgedict in path:
            e = edgedict['sek']
            if e.seq.watson == len(e.seq.watson) * "-":
                # Placeholder edge: the two overlaps overlap each other by
                # len(e), so only the remainder of the next node is added.
                result += G.node[second_node]['sek'][len(e):]
            else:
                result += e
                result += G.node[second_node]['sek']
        circular_products.append(Dseqrecord(result, circular=True))

    # Keep the first occurrence of every distinct circle. Each candidate is
    # compared with all products kept so far (not only with the first one of
    # its length), as a rotation in either orientation.
    unique_circular_products = []
    kept_by_length = {}
    for product in circular_products:
        kept = kept_by_length.setdefault(len(product), [])
        doubled = str(product.seq).lower() * 2
        doubled_rc = str(product.seq.reverse_complement()).lower() * 2
        if any(k in doubled or k in doubled_rc for k in kept):
            continue
        kept.append(str(product.seq).lower())
        unique_circular_products.append(product)

    unique_circular_products.sort(key=len, reverse=True)

    for cp in unique_circular_products:
        length = len(cp)
        cp.description = "circular assembly product {}".format(length)
        assert not cp.linear
        # Two linear copies in a row, so that stretches spanning the origin
        # can be sliced out directly.
        cp2 = Dseqrecord(cp, linear=True) + Dseqrecord(cp, linear=True)

        for helper_key in ("from_left", "to_right"):
            markers = [f for f in cp.features if helper_key in f.qualifiers]
            cp.features = [f for f in cp.features
                           if helper_key not in f.qualifiers]
            for marker in markers:
                seq = marker.qualifiers[helper_key]
                position = marker.location.start
                if helper_key == "from_left":
                    # marker sits where the feature starts
                    begin, end = position, position + len(seq)
                else:
                    # marker sits where the feature ends
                    begin, end = position - len(seq), position
                # A negative begin is looked up one turn further on in the
                # doubled sequence, for both strands.
                offset = length if begin < 0 else 0
                window = cp2[begin + offset:end + offset].seq
                if marker.strand == 1:
                    found = str(window).lower()
                elif marker.strand == -1:
                    found = str(rc(window)).lower()
                else:
                    continue
                if found != str(seq).lower():
                    # The feature sequence is not intact in this product.
                    continue
                cp.features.append(
                    relocate(marker, begin, end, length, helper_key))

    return frecs, unique_circular_products
def linear_assembly(form_rec_list, limit=25):
    '''Assemble Dseqrecords into linear products by homologous recombination.

    The records are joined through shared regions of homology with a
    minimum length given by limit.

    Parameters
    ----------
    form_rec_list : list
        a list of Dseqrecord objects.
    limit : int, optional
        minimum length of a region of homology, 25 by default.

    Returns
    -------
    frecs, lp : tuple
        frecs are the records returned by ``_make_graph`` (copies of the
        given Dseqrecords and their reverse complements) with the regions
        of homology added to the features.
        lp is a list of Dseqrecords representing the linear products
        sorted by length (long -> short). Products with the same sequence
        as another product, in either orientation, are left out.
    '''
    frecs, G = _make_graph(form_rec_list, limit)

    # Every cycle is collapsed into one new node per rotation so that the
    # path search below can pass through the cycle as a single unit.
    for cycle in nx.simple_cycles(G):
        circular_paths = [(cycle + cycle[1:])[n:len(cycle) + n]
                          for n in range(len(cycle) - 1)]
        for circular_path in circular_paths:
            keynode = circular_path[0]
            # collect edges
            edge_seqs = [G[u][v][0]['sek']
                         for u, v in zip(circular_path, circular_path[1:])]
            # collect nodes
            node_seqs = [G.node[node]['sek'] for node in circular_path]
            # node, edge, node, ... , node (the trailing fill value is cut)
            interleaved = list(itertools.chain.from_iterable(
                itertools.izip_longest(node_seqs, edge_seqs)))[:-1]
            merged = reduce(operator.add, interleaved)
            incoming = [(n, keynode) for n in G.predecessors(keynode)
                        if n not in circular_path]
            outgoing = [(keynode, n) for n in G.successors(keynode)
                        if n not in circular_path]
            new_node = seguid(merged.seq)
            G.add_node(new_node, sek=merged)
            G.add_edges_from([(p[0], new_node, {'sek': G[p[0]][p[1]][0]['sek']})
                              for p in incoming])
            G.add_edges_from([(new_node, p[1], {'sek': G[p[0]][p[1]][0]['sek']})
                              for p in outgoing])

    linear_products = []
    for path in all_simple_paths_edges(G, '5', '3', data=True):
        result = Dseqrecord("")
        for first_node, second_node, edgedict in path:
            e = list(edgedict.values()).pop()['sek']
            if e.seq.watson == len(e.seq.watson) * "-":
                # Placeholder edge: the two overlaps overlap each other by
                # len(e), so only the remainder of the next node is added.
                result += G.node[second_node]['sek'][len(e):]
            else:
                result += e
                result += G.node[second_node]['sek']
        assert not result.circular
        linear_products.append(result)

    # Remove duplicates. Each candidate is checked against every product
    # already kept (not only against one reference per length). The list is
    # walked backwards so that, as before, the last of a set of identical
    # products is the one that is kept.
    seen = set()
    unique_linear_products = []
    for product in reversed(linear_products):
        forward = str(product.seq).lower()
        backward = str(product.seq.reverse_complement()).lower()
        if forward in seen or backward in seen:
            continue
        seen.add(forward)
        unique_linear_products.append(product)
    unique_linear_products.reverse()

    unique_linear_products.sort(key=len, reverse=True)

    for lp in unique_linear_products:
        lp.description = "linear assembly product {}".format(len(lp))

        for helper_key in ("from_left", "to_right"):
            markers = [f for f in lp.features if helper_key in f.qualifiers]
            lp.features = [f for f in lp.features
                           if helper_key not in f.qualifiers]
            for marker in markers:
                seq = marker.qualifiers[helper_key]
                position = marker.location.start
                if helper_key == "from_left":
                    # marker sits where the feature starts
                    begin, end = position, position + len(seq)
                else:
                    # marker sits where the feature ends
                    begin, end = position - len(seq), position
                window = lp[begin:end].seq
                if marker.strand == 1:
                    found = str(window).lower()
                elif marker.strand == -1:
                    found = str(rc(window)).lower()
                else:
                    continue
                if found != str(seq).lower():
                    # The feature sequence is not intact in this product.
                    continue
                newf = SeqFeature(FeatureLocation(begin, end),
                                  type=marker.type,
                                  location_operator=marker.location_operator,
                                  strand=marker.strand,
                                  id=marker.id,
                                  qualifiers={k: v for k, v
                                              in marker.qualifiers.items()
                                              if k != helper_key},
                                  sub_features=None,)
                lp.features.append(newf)

    return frecs, unique_linear_products
def _make_graph(recs, limit=25):
    '''Build the overlap graph used by the assembly functions.

    Parameters
    ----------
    recs : iterable
        Dseqrecord objects. They are deep-copied, the given objects are
        not modified.
    limit : int, optional
        minimum length of a shared region passed to common_sub_strings,
        25 by default.

    Returns
    -------
    form_rec_list, G : tuple
        form_rec_list holds the copied records followed by their reverse
        complements, each with one feature of type "overlap" per shared
        region found in it.
        G is a networkx MultiDiGraph. The nodes "5" and "3" stand for the
        record ends, every other node is keyed by the seguid checksum of a
        shared region and carries that region in the node attribute 'sek'.
        Each edge carries in 'sek' the sequence between two consecutive
        overlaps of one record.
    '''
    form_rec_list = list(copy.deepcopy(recs))
    for frec in form_rec_list:
        # Start from a clean slate if the records were processed before.
        frec.features = [f for f in frec.features if f.type != "overlap"]
        # NOTE(review): marked "!!!" by the original author; presumably
        # turns overhangs into blunt double stranded ends -- confirm
        # against Dseq.fill_in.
        frec.seq = frec.seq.fill_in()

    # Reverse complement of every record, looked up by the record itself.
    # (Not called rc, which is the imported reverse_complement function.)
    revcomp_of = {frec: frec.rc() for frec in form_rec_list}

    G = nx.MultiDiGraph(multiedges=True, selfloops=False)
    G.add_node("5", sek=Dseqrecord(""))
    G.add_node("3", sek=Dseqrecord(""))

    matches = []
    for a, b in itertools.combinations(form_rec_list, 2):
        match = common_sub_strings(str(a.seq).upper(),
                                   str(b.seq).upper(),
                                   limit)
        if match:
            matches.append((a, b, match))
        match = common_sub_strings(str(a.seq).upper(),
                                   str(revcomp_of[b].seq).upper(),
                                   limit)
        if match:
            matches.append((a, revcomp_of[b], match))
            # The same regions seen from the opposite strands: positions
            # are counted from the other end of each record.
            matches.append((revcomp_of[a], b,
                            [(len(a) - sa - le, len(b) - sb - le, le)
                             for sa, sb, le in match]))

    for a, b, match in matches:
        for start_in_a, start_in_b, length in match:
            node_seq = a[start_in_a:start_in_a + length]
            node_seq2 = b[start_in_b:start_in_b + length]
            assert str(node_seq.seq).lower() == str(node_seq2.seq).lower()
            node_seq.features.extend(node_seq2.features)
            chksum = seguid(node_seq.seq)
            G.add_node(chksum, sek=node_seq)
            qual = {"note": "olp_{}".format(chksum),
                    "chksum": chksum,
                    "ApEinfo_fwdcolor": "green",
                    "ApEinfo_revcolor": "red",}
            a.features.append(
                SeqFeature(FeatureLocation(start_in_a, start_in_a + length),
                           type="overlap",
                           qualifiers=qual))
            b.features.append(
                SeqFeature(FeatureLocation(start_in_b, start_in_b + length),
                           type="overlap",
                           qualifiers=qual))

    form_rec_list.extend(revcomp_of.values())

    for frec in form_rec_list:
        # One overlap feature per checksum, ordered along the record.
        overlaps = sorted({f.qualifiers["chksum"]: f
                           for f in frec.features
                           if f.type == "overlap"}.values(),
                          key=operator.attrgetter("location.start"))
        if overlaps:
            # Frame the overlaps with zero-width features for both ends.
            overlaps = ([SeqFeature(FeatureLocation(0, 0),
                                    type="overlap",
                                    qualifiers={"chksum": "5"})] +
                        overlaps +
                        [SeqFeature(FeatureLocation(len(frec), len(frec)),
                                    type="overlap",
                                    qualifiers={"chksum": "3"})])

        for olp1, olp2 in zip(overlaps, overlaps[1:]):
            n1 = olp1.qualifiers["chksum"]
            n2 = olp2.qualifiers["chksum"]
            start, end = olp1.location.end, olp2.location.start
            sek = frec[start:end]
            if start > end:
                # tandem overlaps overlap with each other !
                # The edge becomes a placeholder as long as the shared part.
                sek = Dseqrecord((start - end) * "-")

            # Features cut by an edge boundary are remembered as zero-width
            # markers that carry the complete feature sequence.
            for feature in frec.features:
                if start < feature.location.end < end and feature.location.start < start:
                    # Ends inside the edge but begins to the left of it.
                    # The qualifiers are copied: adding the helper key to
                    # the shared dict would also alter the source feature
                    # and every other marker made from it.
                    newf = SeqFeature(FeatureLocation(feature.location.end - start,
                                                      feature.location.end - start),
                                      type=feature.type,
                                      location_operator=feature.location_operator,
                                      strand=feature.strand,
                                      id=feature.id,
                                      qualifiers=dict(feature.qualifiers),
                                      sub_features=None,)
                    newf.qualifiers['to_right'] = feature.extract(frec).seq
                    sek.features.append(newf)
                if start < feature.location.start < end and feature.location.end > end:
                    # Begins inside the edge but ends to the right of it.
                    newf = SeqFeature(FeatureLocation(feature.location.start - start,
                                                      feature.location.start - start),
                                      type=feature.type,
                                      location_operator=feature.location_operator,
                                      strand=feature.strand,
                                      id=feature.id,
                                      qualifiers=dict(feature.qualifiers),
                                      sub_features=None,)
                    newf.qualifiers['from_left'] = feature.extract(frec).seq
                    sek.features.append(newf)

            G.add_edge(n1, n2, sek=sek)

    return form_rec_list, G


if __name__=="__main__":
    pass