Source code for pydna.assembly2

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import copy
import itertools
import networkx as nx
import operator
import random
from collections import defaultdict
from collections import namedtuple

from Bio.SeqFeature import ExactPosition
from Bio.SeqFeature import FeatureLocation
from Bio.SeqFeature import SeqFeature

from pydna import Dseq
from pydna import Dseqrecord
from pydna._simple_paths7 import all_circular_paths_edges
from pydna._simple_paths8 import all_simple_paths_edges
from findsubstrings_suffix_arrays_python import common_sub_strings
from findterminaloverlap_indexof_python import terminal_overlap

def _add_tr_features(dsrecs, limit):
    #dsrecs=list(copy.deepcopy(dsrecs))
    cols = {}
    for dsrec in dsrecs:
        dsrec.features = [f for f in dsrec.features if f.type!="overlap"]
        dsrec.seq = dsrec.seq.fill_in()
    rcs = {dsrec:dsrec.rc() for dsrec in dsrecs}
    matches=[]
    dsset=set()

    for a, b in itertools.permutations(dsrecs, 2):
        match = terminal_overlap(str(a.seq).lower(), 
                                 str(b.seq).lower(), 
                                 limit)
        if match:
            matches.append((a, b, match))
            dsset.add(a)
            dsset.add(b)

        match = common_sub_strings(str(a.seq).lower(),
                                   str(rcs[b].seq).lower(),
                                                limit)
        if match:
            matches.append((a, rcs[b], match))
            dsset.add(a)
            dsset.add(rcs[b])

    for a,b,match in matches: 
        for start_in_a, start_in_b, length in match:
    
            chksum = a[start_in_a:start_in_a+length].seguid()
            assert chksum == b[start_in_b:start_in_b+length].seguid()
            
            try:
                fcol, revcol = cols[chksum]
            except KeyError:
                fcol = '#%02X%02X%02X' % (random.randint(175,255),random.randint(175,255),random.randint(175,255))
                rcol = '#%02X%02X%02X' % (random.randint(175,255),random.randint(175,255),random.randint(175,255))
                cols[chksum] = fcol,rcol
            
            qual      = {"note"             : ["olp_{}".format(chksum)],
                         "chksum"           : [chksum],
                         "ApEinfo_fwdcolor" : [fcol],
                         "ApEinfo_revcolor" : [rcol]}
            
            if not chksum in [f.qualifiers["chksum"][0] for f in a.features if f.type == "overlap"]:            
                a.features.append( SeqFeature( FeatureLocation(start_in_a,
                                                               start_in_a + length),
                                                               type = "overlap",
                                                               qualifiers = qual))
            if not chksum in [f.qualifiers["chksum"][0] for f in b.features if f.type == "overlap"]:
                b.features.append( SeqFeature( FeatureLocation(start_in_b,
                                                               start_in_b + length),
                                                               type = "overlap",
                                                               qualifiers = qual))
    for ds in dsset:
        ds.features = sorted([f for f in ds.features], key = operator.attrgetter("location.start")) 
    return list(dsset)
    
    

def _add_hr_features(dsrecs, limit):    
    #dsrecs=list(copy.deepcopy(dsrecs))
    cols = {}
    for dsrec in dsrecs:
        dsrec.features = [f for f in dsrec.features if f.type!="overlap"]
        dsrec.seq = dsrec.seq.fill_in()
    #rcs = [dsrec.rc() for dsrec in dsrecs]
    rcs = {dsrec:dsrec.rc() for dsrec in dsrecs}
    matches=[]
    dsset=set()

    for a, b in itertools.combinations(dsrecs, 2):
        match = common_sub_strings(str(a.seq).upper(),
                                   str(b.seq).upper(),
                                                limit)
        if match:
            matches.append((a, b, match))
            dsset.add(a)
            dsset.add(b)
            
        match = common_sub_strings(str(a.seq).upper(),
                                   str(rcs[b].seq).upper(),
                                                limit)
        if match:
            matches.append((a, rcs[b], match))
            dsset.add(a)
            dsset.add(rcs[b])

#    for a in set(dsrecs) - dsset:
#        for rc in rcs:
#            match = common_sub_strings(str(a.seq).upper(),
#                                       str(rc.seq).upper(),
#                                       limit)
#            if match:
#                matches.append((a, rc, match))
#                #matches.append((a.rc(), b, [(len(a)-sa-le, len(b)-sb-le, le) for sa,sb,le in match]))

    for a,b,match in matches: 
        for start_in_a, start_in_b, length in match:
#            dsset.add(a)
#            dsset.add(b)
            chksum = a[start_in_a:start_in_a+length].seguid()
            assert chksum == b[start_in_b:start_in_b+length].seguid()
            
            try:
                fcol, revcol = cols[chksum]
            except KeyError:
                fcol = '#%02X%02X%02X' % (random.randint(175,255),random.randint(175,255),random.randint(175,255))
                rcol = '#%02X%02X%02X' % (random.randint(175,255),random.randint(175,255),random.randint(175,255))
                cols[chksum] = fcol,rcol
            
            qual      = {"note"             : ["olp_{}".format(chksum)],
                         "chksum"           : [chksum],
                         "ApEinfo_fwdcolor" : [fcol],
                         "ApEinfo_revcolor" : [rcol]}
            
            if not chksum in [f.qualifiers["chksum"][0] for f in a.features if f.type == "overlap"]:            
                a.features.append( SeqFeature( FeatureLocation(start_in_a,
                                                               start_in_a + length),
                                                               type = "overlap",
                                                               qualifiers = qual))
            if not chksum in [f.qualifiers["chksum"][0] for f in b.features if f.type == "overlap"]:
                b.features.append( SeqFeature( FeatureLocation(start_in_b,
                                                               start_in_b + length),
                                                               type = "overlap",
                                                               qualifiers = qual))
    for ds in dsset:
        ds.features = sorted([f for f in ds.features], key = operator.attrgetter("location.start"))          

    return list(dsset)
    
    
    
    
    
    
    
    

def _create_graph(dsrecs):
    
    G=nx.MultiDiGraph( multiedges=True, selfloops=False)
    G.add_node( '5' ) #, sek=Dseqrecord(''))
    G.add_node( '3' ) #, sek=Dseqrecord(''))
    
    for dsrec in dsrecs:
        overlaps = sorted( {f.qualifiers['chksum'][0]:f for f in dsrec.features 
                            if f.type=='overlap'}.values(),
                           key = operator.attrgetter('location.start'))          

        if overlaps:
            overlaps = ([SeqFeature(FeatureLocation(0,0),
                         type = 'overlap',
                         qualifiers = {'chksum':['5']})]+
                         overlaps+
                        [SeqFeature(FeatureLocation(len(dsrec),len(dsrec)),
                                    type = 'overlap',
                                    qualifiers = {'chksum':['3']})])
            
            for olp1, olp2 in itertools.combinations(overlaps, 2):
                
                n1 = olp1.qualifiers['chksum'][0]
                n2 = olp2.qualifiers['chksum'][0]
                
                if n1 == '5' and n2=='3':
                    continue   
                
                s1,e1,s2,e2 = (olp1.location.start.position,
                               olp1.location.end.position,
                               olp2.location.start.position,
                               olp2.location.end.position,)
                
                #start, end = olp1.location.end, olp2.location.start
                
                #sek = dsrec[start:end]
                #sek = dsrec[e1:s2]
                
                #begin, stop = int(olp1.location.start), int(olp2.location.end)
                
                #processed_sek = dsrec[begin:stop]
                processed_sek = dsrec[s1:e2]
                    
                processed_sek = Dseqrecord(Dseq(
                                watson = processed_sek.seq.watson[:len(processed_sek.seq.watson)-len(olp2)],
                                crick  = processed_sek.seq.crick[:len(processed_sek.seq.crick)-len(olp1)],
                                ovhg   = -len(olp1)))
                
                #if start>end:
#                if e1>s2:
#                    sek=Dseqrecord((e1-s2)*'-') # tandem overlaps overlap with each other !
                        
                G.add_edge(n1, n2, #sek=sek, 
                                   dsrec=dsrec, 
                                   processed_sek = processed_sek, 
                                   start1=s1,
                                   end1=e1,
                                   start2=s2,
                                   end2=e2)
#                G.node[n1]['sek'] = olp1.extract(dsrec)
#                G.node[n2]['sek'] = olp2.extract(dsrec)        
    return G


def _recombination_assembly(dsrecs, homology_alg, limit):
    
    assembly = namedtuple('assembly', 'result source_fragments sticky_fragments source_offsets sticky_offsets overlap_sizes')
    
    dsrecs = homology_alg(dsrecs, limit)
    
    max_nodes = len(dsrecs)

#    for d in dsrecs:
#        import time
#        ape(d)
#        time.sleep(0.5)
#    import sys;sys.exit()
    
    lG = _create_graph(dsrecs)    
    cG = lG.copy()    
    cG.remove_nodes_from(('5','3'))
    circular_products=defaultdict(list)
    
#    all_circular_paths_edges_=[]                                        #
#    for path in all_circular_paths_edges(cG):                           #
#        all_circular_paths_edges_.append(path)                          #
#    all_circular_paths_edges_=[all_circular_paths_edges_[1],]           #

    for path in all_circular_paths_edges(cG):     

        first_node, second_node, edgedict = path[0]
        sticky_offsets = [0, edgedict['start2']-edgedict['start1']]
        source_offsets = [0, edgedict['start2'], ]
        source_seqs=[edgedict['dsrec'],]
        sticky_source_seqs=[edgedict['processed_sek'],]
        #esek=edgedict['sek']
        overlap_sizes = [0, edgedict['end2']-edgedict['start2']]
        
        #if esek.seq.watson == len(esek.seq.watson)*'-':
        if edgedict['start2']<edgedict['end1']:
            #result=cG.node[second_node]['sek'][len(esek):]
            #result=edgedict['dsrec'][edgedict['start1']:edgedict['start1']+len(esek)]
            result=edgedict['dsrec'][edgedict['start2']+(edgedict['end1']-edgedict['start2']):edgedict['end2']]
        else:
            #result=esek+cG.node[second_node]['sek']
            result=edgedict['dsrec'][edgedict['end1']:edgedict['end2']]
        
        for first_node, second_node, edgedict in path[1:]:            
            #esek=edgedict['sek']
            source_seqs.append(edgedict['dsrec'])
            sticky_source_seqs.append(edgedict['processed_sek'])
            overlap_sizes.append(edgedict['end2']-edgedict['start2'])
            #if esek.seq.watson == len(esek.seq.watson)*'-':
            if edgedict['start2']<edgedict['end1']:
                #result+=cG.node[second_node]['sek'][len(esek):]
                #print cG.node[second_node]['sek'][len(esek):]
                result+=edgedict['dsrec'][edgedict['start2']+(edgedict['end1']-edgedict['start2']):edgedict['end2']]                
                #raw_input("zzz")
            else:
                #result+=e
                #result+=cG.node[second_node]['sek']
                result+=edgedict['dsrec'][edgedict['end1']:edgedict['end2']]

            #sticky_offsets.append( len(result) )
            #source_offsets.append( sticky_offsets[-2] - edgedict['start1'] + hej )
            
            sticky_offsets.append( sticky_offsets[-1] + edgedict['start2'] - edgedict['start1'] )
            source_offsets[-1] -= edgedict['start1']
            source_offsets.append( source_offsets[-1] + edgedict['start2'] )
            
        add=True
        for cp in circular_products[len(result)]:
            if (str(result.seq).lower() in str(cp.result.seq).lower()*2
                or
                str(result.seq).lower() == str(cp.result.seq.reverse_complement()).lower()*2):
                add=False                
        if add:
            circular_products[len(result)].append( assembly(Dseqrecord(result, circular = True), source_seqs, sticky_source_seqs, source_offsets[:-1], sticky_offsets[:-1], overlap_sizes) )

    circular_products = list(itertools.chain.from_iterable(circular_products[size] for size in sorted(circular_products, reverse=True)))
    
    linear_products=defaultdict(list)    
    pw = []
    sticky_source_seqs=[]
    
#    for d in dsrecs:
#        import time
#        ape(d)
#        time.sleep(0.5)
#    import sys;sys.exit()

    
    for path in all_simple_paths_edges(lG, '5', '3', data=True, cutoff=max_nodes):
#        sticky_offsets = [0]
#        source_offsets = []
#        source_seqs=[]
#        sticky_source_seqs=[]
#        result = Dseqrecord('')
        
        
        first_node, second_node, edgedict = path[0]
        edgedict = edgedict.values().pop()
        #print first_node, second_node, edgedict
        sticky_offsets = [0, edgedict['start2']-edgedict['start1']]
        source_offsets = [0, edgedict['start2'], ]
        source_seqs=[edgedict['dsrec'],]
        sticky_source_seqs=[edgedict['processed_sek'],]
        #esek=edgedict['sek']
        overlap_sizes = [0, edgedict['end2']-edgedict['start2']]
        
        #if esek.seq.watson == len(esek.seq.watson)*'-':
        if edgedict['start2']<edgedict['end1']:
            #result=cG.node[second_node]['sek'][len(esek):]
            result=edgedict['dsrec'][edgedict['start2']+(edgedict['end1']-edgedict['start2']):edgedict['end2']]
        else:
            #result=esek+cG.node[second_node]['sek']
            result=edgedict['dsrec'][edgedict['end1']:edgedict['end2']]
            
        for first_node, second_node, ed in path[1:]:
            
            edgedict=ed.values().pop()
            #e=edgedict['sek']
            source_seqs.append(edgedict['dsrec'])
            sticky_source_seqs.append(edgedict['processed_sek'])
            
            #if e.seq.watson == len(e.seq.watson)*'-':
            if edgedict['start2']<edgedict['end1']:
                #result+=lG.node[second_node]['sek'][len(e):]
                result+=edgedict['dsrec'][edgedict['start2']+(edgedict['end1']-edgedict['start2']):edgedict['end2']] 
            else:
                #result+=e
                #result+=lG.node[second_node]['sek']
                result+=edgedict['dsrec'][edgedict['end1']:edgedict['end2']]
                
#            sticky_offsets.append( len(result) - len(lG.node[second_node]['sek']))
#            source_offsets.append( sticky_offsets[-2] - h['start1'])
            sticky_offsets.append( sticky_offsets[-1] + edgedict['start2'] - edgedict['start1'] )
            source_offsets[-1] -= edgedict['start1']
            source_offsets.append( source_offsets[-1] + edgedict['start2'] )
            overlap_sizes.append(edgedict['end2']-edgedict['start2'])
            

        add=True
        for lp in linear_products[len(result)]:
            if (str(result.seq).lower() == str(lp.result.seq).lower()
                or
                str(result.seq).lower() == str(lp.result.seq.reverse_complement()).lower()):
                add=False
        for dsrec in dsrecs:
            if (str(result.seq).lower() == str(dsrec.seq).lower()
                or
                str(result.seq).lower() == str(dsrec.seq.reverse_complement()).lower()):
                add=False
        if add:
            linear_products[len(result)].append(assembly( Dseqrecord(result, linear = True), source_seqs, sticky_source_seqs, source_offsets[:-1], sticky_offsets[:-1], overlap_sizes[:-1]) )

    linear_products = list(itertools.chain.from_iterable(linear_products[size] for size in sorted(linear_products, reverse=True)))
    
    products = namedtuple('products', 'linear circular')
    
    return products(linear_products, circular_products)
    
[docs]def gibson_assembly(dsrecs, limit=15): '''Accepts a list of Dseqrecords and tries to assemble them into linear or circular assemblies based on shared regions of homology at the extremities of each fragment with a minimum length given by limit. This is a simulation of the DNA assembly method described in: Gibson DG, Young L, Chuang R-Y, Venter JC, Hutchison CA 3rd, Smith HO (2009) Enzymatic assembly of DNA molecules up to several hundred kilobases. Nat Methods 6:343–345. doi: 10.1038/nmeth.1318 Parameters ---------- dsrecs : list a list of Dseqrecord objects. limit : int, optional limit is set to 15 nucleotides by default. Returns ------- linear, circular : tuple Linear and circular are two lists containing linear and circular assemblies. Each object in linear or circular is a named tuple with the following fields: +------------------+---------------------+----------------------------+ | Field | Type | Contains | +==================+=====================+============================+ | result | Dseqrecord | Assembeled sequence | +------------------+---------------------+----------------------------+ | fragments | list of Dseqrecords | list of fragments | +------------------+---------------------+----------------------------+ | offsets | list of integers | list of cumulative offsets | | | | aligning each source | | | | fragment | +------------------+------------+--------+----------------------------+ | overlap_sizes | list of integers | list of the length of each | | | | overlap joining the | | | | sequences | +------------------+---------------------+----------------------------+ Source fragments are the double stranded blunt DNA fragments originally added as the dsrecs argument. The parwise overlaps found are added ar features to the source fragments. Offsets is a list with the cumulative stagger between each fragment. :: <--6-> <--6-> overlap_sizes = [6,6] cggcggcgggcc \\ gccgccgcccggACGGAG | | TGCCTCaccattgc | sticky_fragments = [12,26] tggtaacgTTTTTT | | AAAAAAcatcata | gtagtat / <----------> <------------------------> offsets ''' pl, pc = _recombination_assembly(dsrecs, _add_tr_features, limit) if not pl or pc: return [], [] products = namedtuple('products', 'linear circular') assembly = namedtuple('assembly', 'result fragments offsets overlap_sizes') linear_results=[] circular_results=[] for a in pl: linear_results.append( assembly(result = a.result, fragments = a.sticky_fragments, offsets = a.sticky_offsets, overlap_sizes = a.overlap_sizes)) for a in pc: circular_results.append( assembly(result = a.result, fragments = a.sticky_fragments, offsets = a.sticky_offsets, overlap_sizes = a.overlap_sizes)) return products(linear=linear_results, circular=circular_results)
[docs]def recombination_assembly(dsrecs, limit=25): '''Accepts a list of Dseqrecords and tries to assemble them into linear and circular assemblies based on shared regions of homology with a minimum length given by limit. Parameters ---------- dsrecs : list a list of Dseqrecord objects. limit : int, optional limit is set to 25 nucleotides by default. Returns ------- linear, circular: tuple Linear and circular are two lists containing linear and circular assemblies. Each object in linear or circular is a named tuple with the following fields +------------------+---------------------+----------------------------+ | Field | Type | Contains | +==================+=====================+============================+ | result | Dseqrecord | Assembeled sequence | +------------------+---------------------+----------------------------+ | source_fragments | list of Dseqrecords | list of fragments | +------------------+---------------------+----------------------------+ | sticky_fragments | list of Dseqrecords | list of processed fragments| | | | with a single stranded 5' | | | | cohesive end | +------------------+---------------------+----------------------------+ | source_offsets | list of integers | list of cumulative offsets | | | | aligning each source | | | | fragment | +------------------+------------+--------+----------------------------+ | sticky_offsets | list of integers | list of cumulative offsets | | | | aligning each sticky | | | | fragment | +------------------+------------+--------+----------------------------+ | overlap_sizes | list of integers | list of the length of each | | | | overlap joining the | | | | sequences | +------------------+---------------------+----------------------------+ Source fragments are the double stranded blunt DNA fragments originally added as the dsrecs argument. The parwise overlaps found are added ar features to the source fragments. Sticky fragments are source fragments that have been processed so that they can be ligated together by the homologous repair DNA machinery (single-strand annealing pathway). This involves trimming DNA fragments so that the fragments are flanked by a 5' single stranded overhang. Source_offsets is a list with the cumulative stagger between each source fragment. Source offsets are different from sticky offsets when the overlapping sequences are located in the interior of the fragments. :: <--------> <---------------------> source_offsets = [10,23] cggcggcgggccTGCCTCtc \\ gccgccgcccggACGGAGag | | taTGCCTCaccattgcAAAAAAtt | source_fragments atACGGAGtggtaacgTTTTTTaa | | aatAAAAAAcatcata | ttaTTTTTTgtagtat / <--6-> <--6-> overlap_sizes = [6,6] cggcggcgggcc \\ gccgccgcccggACGGAG | | TGCCTCaccattgc | sticky_fragments = [12,26] tggtaacgTTTTTT | | AAAAAAcatcata | gtagtat / <----------> <------------------------> sticky_offsets ''' return _recombination_assembly(dsrecs, _add_hr_features, limit)
[docs]def fusion_pcr_assembly(dsrecs, limit=15): '''Accepts a list of Dseqrecords and tries to assemble them into linear assemblies based on shared regions of homology at the extremities of each fragment with a minimum length given by limit. Parameters ---------- dsrecs : list a list of Dseqrecord objects. limit : int, optional limit is set to 15 nucleotides by default. Returns ------- linear: list Each object in linear is a named tuple with the following fields: +------------------+---------------------+----------------------------+ | Field | Type | Contains | +==================+=====================+============================+ | result | Dseqrecord | Assembeled sequence | +------------------+---------------------+----------------------------+ | fragments | list of Dseqrecords | list of fragments | +------------------+---------------------+----------------------------+ | offsets | list of integers | list of cumulative offsets | | | | aligning each source | | | | fragment | +------------------+------------+--------+----------------------------+ | overlap_sizes | list of integers | list of the length of each | | | | overlap joining the | | | | sequences | +------------------+---------------------+----------------------------+ Source fragments are the double stranded blunt DNA fragments originally added as the dsrecs argument. The parwise overlaps found are added ar features to the source fragments. Offsets is a list with the cumulative stagger between each fragment. :: <--6-> <--6-> overlap_sizes = [6,6] cggcggcgggccTGCCTC \\ gccgccgcccggACGGAG | | TGCCTCaccattgcAAAAAA | fragments ACGGAGtggtaacgTTTTTT | | AAAAAAcatcata | TTTTTTgtagtat / <----------> <-----------------------> offsets = [12,26] ''' p = _recombination_assembly(dsrecs, _add_tr_features, limit).linear if not p: return [] assembly = namedtuple('assembly', 'result fragments offsets overlap_sizes') results=[] for a in p: results.append( assembly(result = a.result, fragments = a.source_fragments, offsets = a.source_offsets, overlap_sizes = a.overlap_sizes)) return results
if __name__=="__main__": from pydna_helper import ape from pydna import Dseqrecord, parse, eq a=Dseqrecord( "atgatcagagTATCTATctTGCCTCATATAAA") b=Dseqrecord( "TGCCTCATATAAAaccaagcatctattagacgtgtatcta") prods = fusion_pcr_assembly( (a,b), 13) for seq, spc, olp in zip(prods[0].fragments, prods[0].offsets, prods[0].overlap_sizes): print " "*spc+"|"*olp print " "*spc+str(seq.seq) # for d in (a,b): # import time # ape(d) # time.sleep(0.5) # import sys;sys.exit() a=Dseqrecord( "atgatcagagTATCTATctTGCCTCATATAAA") b=Dseqrecord( "tagatacacgtctaatagatgcttggtTTTATATGAGGCA") prods = fusion_pcr_assembly( (a,b), 13) for seq, spc, olp in zip(prods[0].fragments, prods[0].offsets, prods[0].overlap_sizes): print " "*spc+"|"*olp print " "*spc+str(seq.seq) a=Dseqrecord( "cggcggcgggccTGCCTC") b=Dseqrecord( "TGCCTCaccattgcAAAAAA") c=Dseqrecord( "AAAAAAcatcata" ) prods = gibson_assembly( (a,b,c), 6).linear for seq, spc, olp in zip(prods[0].fragments, prods[0].offsets, prods[0].overlap_sizes): print " "*spc+"|"*olp print " "*spc+str(seq.seq) a=Dseqrecord( "atgatcagagTATCTATctTGCCTCATATAAAc") b=Dseqrecord( "gTGCCTCATATAAAaccaagcatctattagacgtgtatcta") prods = recombination_assembly( (a,b), 13) for seq, spc, olp in zip(prods.linear[0].sticky_fragments, prods.linear[0].sticky_offsets, prods.linear[0].overlap_sizes): print " "*spc+"|"*olp print " "*spc+str(seq.seq) for seq, spc, spc2, olp in zip(prods.linear[0].source_fragments, prods.linear[0].source_offsets, prods.linear[0].sticky_offsets, prods.linear[0].overlap_sizes): print " "*spc2+"|"*olp print " "*spc+str(seq.seq) a=Dseqrecord( "atgatcagagTATCTATctTGCCTCATATAAAc") b=Dseqrecord( "tagatacacgtctaatagatgcttggtTTTATATGAGGCAc") prods = recombination_assembly( (a,b), 13) for seq, spc, olp in zip(prods.linear[0].sticky_fragments, prods.linear[0].sticky_offsets, prods.linear[0].overlap_sizes): print " "*spc+"|"*olp print " "*spc+str(seq.seq) for seq, spc, spc2, olp in zip(prods.linear[0].source_fragments, prods.linear[0].source_offsets, prods.linear[0].sticky_offsets, prods.linear[0].overlap_sizes): print " "*spc2+"|"*olp print " "*spc+str(seq.seq) print prods.__class__ print prods.linear[0].overlap_sizes;raw_input("!!!") a=Dseqrecord( "cggcggcgggccTGCCTCtc") b=Dseqrecord( "aTGCCTCaccattgcAAAAAAtt") c=Dseqrecord( "tAAAAAAcatcata" ) prods = recombination_assembly( (a,b,c), 6) for seq, spc in zip(prods.linear[0].sticky_fragments, prods.linear[0].sticky_offsets): print " "*spc+str(seq.seq) for seq, spc in zip(prods.linear[0].source_fragments, prods.linear[0].source_offsets): print " "*spc+str(seq.seq) assert prods.linear[0].result.seq.tostring() == "cggcggcgggccTGCCTCaccattgcAAAAAAcatcata" print prods.linear[0].overlap_sizes;raw_input("!!!") a=Dseqrecord( "TATATCATATAAAnnGGGCGCGGGCGGC") b=Dseqrecord( "GGGCGCGGGCGGCNNNTATATCATATAAA") prods = gibson_assembly( (a,b), 13) assert eq( prods.circular[0].result, Dseqrecord("TATATCATATAAAnnGGGCGCGGGCGGCNNN", circular=True) ) print "------------\n" print for seq, spc in zip(prods.circular[0].sticky_fragments, prods.circular[0].sticky_offsets): print " "*spc+str(seq.seq) for seq, spc in zip(prods.circular[0].source_fragments, prods.circular[0].source_offsets): print " "*spc+str(seq.seq) print prods.circular[0].overlap_sizes;raw_input("!!!") a=Dseqrecord( "aaTATATCATATAAAnnGGGCGCGGGCGGCtt") b=Dseqrecord( "ccGGGCGCGGGCGGCNNNTATATCATATAAAaa") prods = recombination_assembly( (a,b), 13) assert eq( prods.circular[0].result, Dseqrecord("TATATCATATAAAnnGGGCGCGGGCGGCNNN", circular=True) ) print "------------\n" print for seq, spc in zip(prods.circular[0].sticky_fragments, prods.circular[0].sticky_offsets): print " "*spc+str(seq.seq) for seq, spc in zip(prods.circular[0].source_fragments, prods.circular[0].source_offsets): print " "*spc+str(seq.seq) a=Dseqrecord( "nGGGGGGatgatcagagTATCTATctTGCCTCtc") b=Dseqrecord( "aTGCCTCaccatgcactTATCTATtgcAAAAAAtt") c=Dseqrecord( "tAAAAAAnncGGGGGGt" ) prods = recombination_assembly( (a,b,c), 6) assert prods.circular[0].result.seq.tostring().lower() == "accatgcactTATCTATtgcAAAAAAnncGGGGGGatgatcagagTATCTATctTGCCTC".lower() for seq, spc in zip(prods.circular[0].sticky_fragments, prods.circular[0].sticky_offsets): print " "*spc+str(seq.seq) for seq, spc in zip(prods.circular[0].source_fragments, prods.circular[0].source_offsets): print " "*spc+str(seq.seq) print prods.circular[0].overlap_sizes;raw_input("!!!") text5 = ''' >2577 tcctgacgggtaattttgatttgcatgccgtccgggtgagtcatagcgtctggttgttttgccagattcagcagagtctgtgcaatgcggccgctgacagagtcttttgtaacgaccccgtctccaccaacttggtatgcttgaaatctcaaggccattacacattcagttatgtgaacgaaaggtctttatttaacgtagcataaactaaataatacaggttccggttagcctgcaatgtgttaaatctaaaggagcatacccaaaatgaactgaagacaaggaaatttgcttgtccagatgtgattgagcatttgaacgttaataacataacatttttatacttaactatagaaagacttgtataaaaactggcaaacgagatattctgaatattggtgcatatttcaggtagaaaagcttacaaaacaatctaatcataatattgagatgaagagaaagataaaagaaaaaacgataagtcagatgagattatgattgtactttgaaatcgaggaacaaagtatatacggtagtagttccccgagttataacgggagatcatgtaaattgagaaaccagataaagatttggtatgcactctagcaagaaaataaaatgatgaatctatgatatagatcacttttgttccagcgtcgaggaacgccaggttgcccactttctcactagtgacctgcagccgacgatcagatctttcaggaaagtttcggaggagatagtgttcggcagtttgtacatcatctgcgggatcaggtacggtttgatcaggttgtagaagatcaggtaagacatagaatcgatgtagatgatcggtttgtttttgttgatttttacgtaacagttcagttggaatttgttacgcagacccttaaccaggtattctacttcttcgaaagtgaaagactgggtgttcagtacgatcgatttgttggtagagtttttgttgtaatcccatttaccaccatcatccatgaaccagtatgccagagacatcggggtcaggtagttttcaaccaggttgttcgggatggtttttttgttgttaacgatgaacaggttagccagtttgttgaaagcttggtgtttgaaagtctgggcgccccaggtgattaccaggttacccaggtggttaacacgttcttttttgtgcggcggggacagtacccactgatcgtacagcagacatacgtggtccatgtatgctttgtttttccactcgaactgcatacagtaggttttaccttcatcacgagaacggatgtaagcatcacccaggatcagaccgatacctgcttcgaactgttcgatgttcagttcgatcagctgggatttgtattctttcagcagtttagagttcggacccaggttcattacctggttttttttgatgtttttcatatgcatggatccggggttttttctccttgacgttaaagtatagaggtatattaacaattttttgttgatacttttattacatttgaataagaagtaatacaaaccgaaaatgttgaaagtattagttaaagtggttatgcagtttttgcatttatatatctgttaatagatcaaaaatcatcgcttcgctgattaattaccccagaaataaggctaaaaaactaatcgcattatcatcctatggttgttaatttgattcgttcatttgaaggtttgtggggccaggttactgccaatttttcctcttcataaccataaaagctagtattgtagaatctttattgttcggagcagtgcggcgcgaggcacatctgcgtttcaggaacgcgaccggtgaagacgaggacgcacggaggagagtcttccttcggagggctgtcacccgctcggcggcttctaatccgtacttcaatatagcaatgagcagttaagcgtattactgaaagttccaaagagaaggtttttttaggctaagataatggggctctttacatttccacaacatataagtaagattagatatggatatgtatatggatatgtatatggtggtaatgccatgtaatatgattattaaacttctttgcgtccatccaacgagatctggcgcgccttaattaacccaacctgcattaatgaatcggccaacgcgcggattaccctgttatccctacatattgttgtgccatctgtgcagacaaacgcatcaggattcagtactgacaataaaaagattcttgttttcaagaacttgtcatttgtatagtttttttatattgtagttgttctattttaatcaaatgttagcgtgatttatattttttttcgcctcgacatcatctgcccagatgcgaagttaagtgcgcagaaagtaatatcatgcgtcaatcgtatgtgaatgctggtcgctatactgctgtcgattcgatactaacgccgccatccagtgtcgaaaacgagctcgaattcatcgatgatatcagatccactagtggcctatgcggccgcggatctgccggtctccctatagtgagtcgatccggatttacctgaatcaattggcgaaattttttgtacgaaatttcagccacttcacag >2389 tcctgacgggtaattttgatttgcatgccgtccgggtgagtcatagcgtctggttgttttgccagattcagcagagtctgtgcaatgcggccgctgaccacatacgatttaggtgacactatagaacgcggccgccagctgaagcttcgtacgctgcaggtcgacggatccccgggttaattaaggcgcgccagatctgtttagcttgccttgtccccgccgggtcacccggccagcgacatggaggcccagaataccctccttgacagtcttgacgtgcgcagctcaggggcatgatgtgactgtcgcccgtacatttagcccatacatccccatgtataatcatttgcatccatacattttgatggccgcacggcgcgaagcaaaaattacggctcctcgctgcagacctgcgagcagggaaacgctcccctcacagacgcgttgaattgtccccacgccgcgcccctgtagagaaatataaaaggttaggatttgccactgaggttcttctttcatatacttccttttaaaatcttgctaggatacagttctcacatcacatccgaacataaacaaccgtcgaggaacgccaggttgcccactttctcactagtgacctgcagccgacccaatcacatcacatccgaacataaacaaccatgggtaaaaagcctgaactcaccgcgacgtctgtcgagaagtttctgatcgaaaagttcgacagcgtctccgacctgatgcagctctcggagggcgaagaatctcgtgctttcagcttcgatgtaggagggcgtggatatgtcctgcgggtaaatagctgcgccgatggtttctacaaagatcgttatgtttatcggcactttgcatcggccgcgctcccgattccggaagtgcttgacattggggaattcagcgagagcctgacctattgcatctcccgccgtgcacagggtgtcacgttgcaagacctgcctgaaaccgaactgcccgctgttctgcagccggtcgcggaggccatggatgcgatcgctgcggccgatcttagccagacgagcgggttcggcccattcggaccgcaaggaatcggtcaatacactacatggcgtgatttcatatgcgcgattgctgatccccatgtgtatcactggcaaactgtgatggacgacaccgtcagtgcgtccgtcgcgcaggctctcgatgagctgatgctttgggccgaggactgccccgaagtccggcacctcgtgcacgcggatttcggctccaacaatgtcctgacggacaatggccgcataacagcggtcattgactggagcgaggcgatgttcggggattcccaatacgaggtcgccaacatcttcttctggaggccgtggttggcttgtatggagcagcagacgcgctacttcgagcggaggcatccggagcttgcaggatcgccgcggctccgggcgtatatgctccgcattggtcttgaccaactctatcagagcttggttgacggcaatttcgatgatgcagcttgggcgcagggtcgatgcgacgcaatcgtccgatccggagccgggactgtcgggcgtacacaaatcgcccgcagaagcgcggccgtctggaccgatggctgtgtagaagtactcgccgatagtggaaaccgacgccccagcactcgtccgagggcaaaggaataatcagtactgacaataaaaagattcttgtagggataacagggtaatcggagtgccatctgtgcagacaaacgcatcaggatagagtcttttgtaacgaccccgtctccaccaacttggtatgcttgaaatctcaaggccattacacattcagttatgtgaacgaaaggtctttatttaacgtagcataaactaaataatacaggttccggttagcctgcaatgtgttaaatctaaaggagcatacccaaaatgaactgaagacaaggaaatttgcttgtccagatgtgattgagcatttgaacgttaataacataacatttttatacttaactatagaaagacttgtataaaaactggcaaacgagatattctgaatattggtgcatatttcaggtagaaaagcttacaaaacaatctaatcataatattgagatgaagagaaagataaaagaaaaaacgataagtcagatgagattatgattgtactttgaaatcgaggaacaaagtatatacggtagtagttccccgagttataacgggagatcatgtaaattgagaaaccagataaagatttggtatgcactctagcaagaaaataaaatgatgaatctatgatatagatcacttttgttccagcatccggatttacctgaatcaattggcgaaattttttgtacgaaatttcagccacttcacag >5681 atccggatttacctgaatcaattggcgaaattttttgtacgaaatttcagccacttcacaggcggttttcgcacgtacccatgcgctacgttcctggccctcttcaaacaggcccagttcgccaataaaatcaccctgattcagataggagaggatcatttctttaccctcttcgtctttgatcagcactgccacagagcctttaacgatgtagtacagcgtttccgctttttcaccctggtgaataagcgtgctcttggatgggtacttatgaatgtggcaatgagacaagaaccattcgagagtaggatccgtttgaggtttaccaagtaccataagatccttaaatttttattatctagctagatgataatattatatcaagaattgtacctgaaagcaaataaattttttatctggcttaactatgcggcatcagagcagattgtactgagagtgcaccatatgcggtgtgaaataccgcacagatgcgtaaggagaaaataccgcatcaggcgctcttccgcttcctcgctcactgactcgctgcgctcggtcgttcggctgcggcgagcggtatcagctcactcaaaggcggtaatacggttatccacagaatcaggggataacgcaggaaagaacatgtgagcaaaaggccagcaaaagcccaggaaccgtaaaaaggccgcgttgctggcgtttttccataggctccgcccccctgacgagcatcacaaaaatcgacgctcaagtcagaggtggcgaaacccgacaggactataaagataccaggcgtttccccctggaagctccctcgtgcgctctcctgttccgaccctgccgcttaccggatacctgtccgcctttctcccttcgggaagcgtggcgctttctcatagctcacgctgtaggtatctcagttcggtgtaggtcgttcgctccaagctgggctgtgtgcacgaaccccccgttcagcccgaccgctgcgccttatccggtaactatcgtcttgagtccaacccggtaagacacgacttatcgccactggcagcagccactggtaacaggattagcagagcgaggtatgtaggcggtgctacagagttcttgaagtggtggcctaactacggctacactagaaggacagtatttggtatctgcgctctgctgaagccagttaccttcggaaaaagagttggtagctcttgatccggcaaacaaaccaccgctggtagcggtggtttttttgtttgcaagcagcagattacgcgcagaaaaaaaggatctcaagaagatcctttgatcttttctacggggtctgacgctcagtggaacgaaaactcacgttaagggattttggtcatgaggggtaataactgatataattaaattgaagctctaatttgtgagtttagtatacatgcatttacttataatacagttttttagttttgctggccgcatcttctcaaatatgcttcccagcctgcttttctgtaacgttcaccctctaccttagcatcccttccctttgcaaatagtcctcttccaacaataataatgtcagatcctgtagagaccacatcatccacggttctatactgttgacccaatgcgtctcccttgtcatctaaacccacaccgggtgtcataatcaaccaatcgtaaccttcatctcttccacccatgtctctttgagcaataaagccgataacaaaatctttgtcgctcttcgcaatgtcaacagtacccttagtatattctccagtagatagggagcccttgcatgacaattctgctaacatcaaaaggcctctaggttcctttgttacttcttctgccgcctgcttcaaaccgctaacaatacctgggcccaccacaccgtgtgcattcgtaatgtctgcccattctgctattctgtatacacccgcagagtactgcaatttgactgtattaccaatgtcagcaaattttctgtcttcgaagagtaaaaaattgtacttggcggataatgcctttagcggcttaactgtgccctccatggaaaaatcagtcaaaatatccacatgtgtttttagtaaacaaattttgggacctaatgcttcaactaactccagtaattccttggtggtacgaacatccaatgaagcacacaagtttgtttgcttttcgtgcatgatattaaatagcttggcagcaacaggactaggatgagtagcagcacgttccttatatgtagctttcgacatgatttatcttcgtttcctgcaggtttttgttctgtgcagttgggttaagaatactgggcaatttcatgtttcttcaacactacatatgcgtatatataccaatctaagtctgtgctccttccttcgttcttccttctgttcggagattaccgaatcaaaaaaatttcaaagaaaccgaaatcaaaaaaaagaataaaaaaaaaatgatgaattgaattgaaaagctagcttatcgatgataagctgtcaaagatgagaattaattccacggactatagactatactagatactccgtctactgtacgatacacttccgctcaggtccttgtcctttaacgaggccttaccactcttttgttactctattgatccagctcagcaaaggcagtgtgatctaagattctatcttcgcgatgtagtaaaactagctagaccgagaaagagactagaaatgcaaaaggcacttctacaatggctgccatcattattatccgatgtgacgctgcagcttctcaatgatattcgaatacgctttgaggagatacagcctaatatccgacaaactgttttacagatttacgatcgtacttgttacccatcattgaattttgaacatccgaacctgggagttttccctgaaacagatagtatatttgaacctgtataataatatatagtctagcgctttacggaagacaatgtatgtatttcggttcctggagaaactattgcatctattgcataggtaatcttgcacgtcgcatccccggttcattttctgcgtttccatcttgcacttcaatagcatatctttgttaacgaagcatctgtgcttcattttgtagaacaaaaatgcaacgcgagagcgctaatttttcaaacaaagaatctgagctgcatttttacagaacagaaatgcaacgcgaaagcgctattttaccaacgaagaatctgtgcttcatttttgtaaaacaaaaatgcaacgcgacgagagcgctaatttttcaaacaaagaatctgagctgcatttttacagaacagaaatgcaacgcgagagcgctattttaccaacaaagaatctatacttcttttttgttctacaaaaatgcatcccgagagcgctatttttctaacaaagcatcttagattactttttttctcctttgtgcgctctataatgcagtctcttgataactttttgcactgtaggtccgttaaggttagaagaaggctactttggtgtctattttctcttccataaaaaaagcctgactccacttcccgcgtttactgattactagcgaagctgcgggtgcattttttcaagataaaggcatccccgattatattctataccgatgtggattgcgcatactttgtgaacagaaagtgatagcgttgatgattcttcattggtcagaaaattatgaacggtttcttctattttgtctctatatactacgtataggaaatgtttacattttcgtattgttttcgattcactctatgaatagttcttactacaatttttttgtctaaagagtaatactagagataaacataaaaaatgtagaggtcgagtttagatgcaagttcaaggagcgaaaggtggatgggtaggttatatagggatatagcacagagatatatagcaaagagatacttttgagcaatgtttgtggaagcggtattcgcaatgggaagctccaccccggttgataatcagaaaagccccaaaaacaggaagattattatcaaaaaggatcttcacctagatccttttaaattaaaaatgaagttttaaatcaatctaaagtatatatgagtaaacttggtctgacagttaccaatgcttaatcagtgaggcacctatctcagcgatctgtctatttcgttcatccatagttgcctgactccccgtcgtgtagataactacgatacgggagcgcttaccatctggccccagtgctgcaatgataccgcgagacccacgctcaccggctccagatttatcagcaataaaccagccagccggaagggccgagcgcagaagtggtcctgcaactttatccgcctccatccagtctattaattgttgccgggaagctagagtaagtagttcgccagttaatagtttgcgcaacgttgttggcattgctacaggcatcgtggtgtcactctcgtcgtttggtatggcttcattcagctccggttcccaacgatcaaggcgagttacatgatcccccatgttgtgcaaaaaagcggttagctccttcggtcctccgatcgttgtcagaagtaagttggccgcagtgttatcactcatggttatggcagcactgcataattctcttactgtcatgccatccgtaagatgcttttctgtgactggtgagtactcaaccaagtcattctgagaatagtgtatgcggcgaccgagttgctcttgcccggcgtcaatacgggataatagtgtatcacatagcagaactttaaaagtgctcatcattggaaaacgttcttcggggcgaaaactctcaaggatcttaccgctgttgagatccagttcgatgtaacccactcgtgcacccaactgatcttcagcatcttttactttcaccagcgtttctgggtgagcaaaaacaggaaggcaaaatgccgcaaaaaagggaataagggcgacacggaaatgttgaatactcatactcttcctttttcaatattattgaagcatttatcagggttattgtctcatgagcggatacatatttgaatgtatttagaaaaataaacaaataggggttccgcgcacatttccccgaaaagtgccacctgctaagaaaccattattatcatgacattaacctataaaaataggcgtatcacgaggccctttcgtctcgcgcgtttcggtgatgacggtgaaaacctctgacacatgcagctcccggagacggtcacagcttgtctgtaagcggatgccgggagcagacaagcccgtcagggcgcgtcagcgggtgttggcgggtgtcggggctggcttaactatgcggcatcagagcagattgtactgagagtgcaccatagatcctgaggatcggggtgataaatcagtctgcgccacatcgggggaaacaaaatggcgcgagatctaaaaaaaaaggctccaaaaggagcctttcgcgctaccaggtaacgcgccactccgacgggattaacgagtgccgtaaacgacgatggttttaccgtgtgcggagatcaggttctgatcctcgagcatcttaagaattcgtcccacggtttgtctagagcagccgacaatctggccaatttcctgacgggtaattttgatttgcatgccgtccgggtgagtcatagcgtctggttgttttgccagattcagcagagtctgtgcaatgcggccgctgac ''' lst = parse(text5) prods = recombination_assembly(lst, limit=60) correct = 'tcgcgcgtttcggtgatgacggtgaaaacctctgacacatgcagctcccggagacggtcacagcttgtctgtaagcggatgccgggagcagacaagcccgtcagggcgcgtcagcgggtgttggcgggtgtcggggctggcttaactatgcggcatcagagcagattgtactgagagtgcaccatagatcctgaggatcggggtgataaatcagtctgcgccacatcgggggaaacaaaatggcgcgagatctaaaaaaaaaggctccaaaaggagcctttcgcgctaccaggtaacgcgccactccgacgggattaacgagtgccgtaaacgacgatggttttaccgtgtgcggagatcaggttctgatcctcgagcatcttaagaattcgtcccacggtttgtctagagcagccgacaatctggccaatttcctgacgggtaattttgatttgcatgccgtccgggtgagtcatagcgtctggttgttttgccagattcagcagagtctgtgcaatgcggccgctgaccacatacgatttaggtgacactatagaacgcggccgccagctgaagcttcgtacgctgcaggtcgacggatccccgggttaattaaggcgcgccagatctgtttagcttgccttgtccccgccgggtcacccggccagcgacatggaggcccagaataccctccttgacagtcttgacgtgcgcagctcaggggcatgatgtgactgtcgcccgtacatttagcccatacatccccatgtataatcatttgcatccatacattttgatggccgcacggcgcgaagcaaaaattacggctcctcgctgcagacctgcgagcagggaaacgctcccctcacagacgcgttgaattgtccccacgccgcgcccctgtagagaaatataaaaggttaggatttgccactgaggttcttctttcatatacttccttttaaaatcttgctaggatacagttctcacatcacatccgaacataaacaaccgtcgaggaacgccaggttgcccactttctcactagtgacctgcagccgacccaatcacatcacatccgaacataaacaaccatgggtaaaaagcctgaactcaccgcgacgtctgtcgagaagtttctgatcgaaaagttcgacagcgtctccgacctgatgcagctctcggagggcgaagaatctcgtgctttcagcttcgatgtaggagggcgtggatatgtcctgcgggtaaatagctgcgccgatggtttctacaaagatcgttatgtttatcggcactttgcatcggccgcgctcccgattccggaagtgcttgacattggggaattcagcgagagcctgacctattgcatctcccgccgtgcacagggtgtcacgttgcaagacctgcctgaaaccgaactgcccgctgttctgcagccggtcgcggaggccatggatgcgatcgctgcggccgatcttagccagacgagcgggttcggcccattcggaccgcaaggaatcggtcaatacactacatggcgtgatttcatatgcgcgattgctgatccccatgtgtatcactggcaaactgtgatggacgacaccgtcagtgcgtccgtcgcgcaggctctcgatgagctgatgctttgggccgaggactgccccgaagtccggcacctcgtgcacgcggatttcggctccaacaatgtcctgacggacaatggccgcataacagcggtcattgactggagcgaggcgatgttcggggattcccaatacgaggtcgccaacatcttcttctggaggccgtggttggcttgtatggagcagcagacgcgctacttcgagcggaggcatccggagcttgcaggatcgccgcggctccgggcgtatatgctccgcattggtcttgaccaactctatcagagcttggttgacggcaatttcgatgatgcagcttgggcgcagggtcgatgcgacgcaatcgtccgatccggagccgggactgtcgggcgtacacaaatcgcccgcagaagcgcggccgtctggaccgatggctgtgtagaagtactcgccgatagtggaaaccgacgccccagcactcgtccgagggcaaaggaataatcagtactgacaataaaaagattcttgtagggataacagggtaatcggagtgccatctgtgcagacaaacgcatcaggatagagtcttttgtaacgaccccgtctccaccaacttggtatgcttgaaatctcaaggccattacacattcagttatgtgaacgaaaggtctttatttaacgtagcataaactaaataatacaggttccggttagcctgcaatgtgttaaatctaaaggagcatacccaaaatgaactgaagacaaggaaatttgcttgtccagatgtgattgagcatttgaacgttaataacataacatttttatacttaactatagaaagacttgtataaaaactggcaaacgagatattctgaatattggtgcatatttcaggtagaaaagcttacaaaacaatctaatcataatattgagatgaagagaaagataaaagaaaaaacgataagtcagatgagattatgattgtactttgaaatcgaggaacaaagtatatacggtagtagttccccgagttataacgggagatcatgtaaattgagaaaccagataaagatttggtatgcactctagcaagaaaataaaatgatgaatctatgatatagatcacttttgttccagcgtcgaggaacgccaggttgcccactttctcactagtgacctgcagccgacgatcagatctttcaggaaagtttcggaggagatagtgttcggcagtttgtacatcatctgcgggatcaggtacggtttgatcaggttgtagaagatcaggtaagacatagaatcgatgtagatgatcggtttgtttttgttgatttttacgtaacagttcagttggaatttgttacgcagacccttaaccaggtattctacttcttcgaaagtgaaagactgggtgttcagtacgatcgatttgttggtagagtttttgttgtaatcccatttaccaccatcatccatgaaccagtatgccagagacatcggggtcaggtagttttcaaccaggttgttcgggatggtttttttgttgttaacgatgaacaggttagccagtttgttgaaagcttggtgtttgaaagtctgggcgccccaggtgattaccaggttacccaggtggttaacacgttcttttttgtgcggcggggacagtacccactgatcgtacagcagacatacgtggtccatgtatgctttgtttttccactcgaactgcatacagtaggttttaccttcatcacgagaacggatgtaagcatcacccaggatcagaccgatacctgcttcgaactgttcgatgttcagttcgatcagctgggatttgtattctttcagcagtttagagttcggacccaggttcattacctggttttttttgatgtttttcatatgcatggatccggggttttttctccttgacgttaaagtatagaggtatattaacaattttttgttgatacttttattacatttgaataagaagtaatacaaaccgaaaatgttgaaagtattagttaaagtggttatgcagtttttgcatttatatatctgttaatagatcaaaaatcatcgcttcgctgattaattaccccagaaataaggctaaaaaactaatcgcattatcatcctatggttgttaatttgattcgttcatttgaaggtttgtggggccaggttactgccaatttttcctcttcataaccataaaagctagtattgtagaatctttattgttcggagcagtgcggcgcgaggcacatctgcgtttcaggaacgcgaccggtgaagacgaggacgcacggaggagagtcttccttcggagggctgtcacccgctcggcggcttctaatccgtacttcaatatagcaatgagcagttaagcgtattactgaaagttccaaagagaaggtttttttaggctaagataatggggctctttacatttccacaacatataagtaagattagatatggatatgtatatggatatgtatatggtggtaatgccatgtaatatgattattaaacttctttgcgtccatccaacgagatctggcgcgccttaattaacccaacctgcattaatgaatcggccaacgcgcggattaccctgttatccctacatattgttgtgccatctgtgcagacaaacgcatcaggattcagtactgacaataaaaagattcttgttttcaagaacttgtcatttgtatagtttttttatattgtagttgttctattttaatcaaatgttagcgtgatttatattttttttcgcctcgacatcatctgcccagatgcgaagttaagtgcgcagaaagtaatatcatgcgtcaatcgtatgtgaatgctggtcgctatactgctgtcgattcgatactaacgccgccatccagtgtcgaaaacgagctcgaattcatcgatgatatcagatccactagtggcctatgcggccgcggatctgccggtctccctatagtgagtcgatccggatttacctgaatcaattggcgaaattttttgtacgaaatttcagccacttcacaggcggttttcgcacgtacccatgcgctacgttcctggccctcttcaaacaggcccagttcgccaataaaatcaccctgattcagataggagaggatcatttctttaccctcttcgtctttgatcagcactgccacagagcctttaacgatgtagtacagcgtttccgctttttcaccctggtgaataagcgtgctcttggatgggtacttatgaatgtggcaatgagacaagaaccattcgagagtaggatccgtttgaggtttaccaagtaccataagatccttaaatttttattatctagctagatgataatattatatcaagaattgtacctgaaagcaaataaattttttatctggcttaactatgcggcatcagagcagattgtactgagagtgcaccatatgcggtgtgaaataccgcacagatgcgtaaggagaaaataccgcatcaggcgctcttccgcttcctcgctcactgactcgctgcgctcggtcgttcggctgcggcgagcggtatcagctcactcaaaggcggtaatacggttatccacagaatcaggggataacgcaggaaagaacatgtgagcaaaaggccagcaaaagcccaggaaccgtaaaaaggccgcgttgctggcgtttttccataggctccgcccccctgacgagcatcacaaaaatcgacgctcaagtcagaggtggcgaaacccgacaggactataaagataccaggcgtttccccctggaagctccctcgtgcgctctcctgttccgaccctgccgcttaccggatacctgtccgcctttctcccttcgggaagcgtggcgctttctcatagctcacgctgtaggtatctcagttcggtgtaggtcgttcgctccaagctgggctgtgtgcacgaaccccccgttcagcccgaccgctgcgccttatccggtaactatcgtcttgagtccaacccggtaagacacgacttatcgccactggcagcagccactggtaacaggattagcagagcgaggtatgtaggcggtgctacagagttcttgaagtggtggcctaactacggctacactagaaggacagtatttggtatctgcgctctgctgaagccagttaccttcggaaaaagagttggtagctcttgatccggcaaacaaaccaccgctggtagcggtggtttttttgtttgcaagcagcagattacgcgcagaaaaaaaggatctcaagaagatcctttgatcttttctacggggtctgacgctcagtggaacgaaaactcacgttaagggattttggtcatgaggggtaataactgatataattaaattgaagctctaatttgtgagtttagtatacatgcatttacttataatacagttttttagttttgctggccgcatcttctcaaatatgcttcccagcctgcttttctgtaacgttcaccctctaccttagcatcccttccctttgcaaatagtcctcttccaacaataataatgtcagatcctgtagagaccacatcatccacggttctatactgttgacccaatgcgtctcccttgtcatctaaacccacaccgggtgtcataatcaaccaatcgtaaccttcatctcttccacccatgtctctttgagcaataaagccgataacaaaatctttgtcgctcttcgcaatgtcaacagtacccttagtatattctccagtagatagggagcccttgcatgacaattctgctaacatcaaaaggcctctaggttcctttgttacttcttctgccgcctgcttcaaaccgctaacaatacctgggcccaccacaccgtgtgcattcgtaatgtctgcccattctgctattctgtatacacccgcagagtactgcaatttgactgtattaccaatgtcagcaaattttctgtcttcgaagagtaaaaaattgtacttggcggataatgcctttagcggcttaactgtgccctccatggaaaaatcagtcaaaatatccacatgtgtttttagtaaacaaattttgggacctaatgcttcaactaactccagtaattccttggtggtacgaacatccaatgaagcacacaagtttgtttgcttttcgtgcatgatattaaatagcttggcagcaacaggactaggatgagtagcagcacgttccttatatgtagctttcgacatgatttatcttcgtttcctgcaggtttttgttctgtgcagttgggttaagaatactgggcaatttcatgtttcttcaacactacatatgcgtatatataccaatctaagtctgtgctccttccttcgttcttccttctgttcggagattaccgaatcaaaaaaatttcaaagaaaccgaaatcaaaaaaaagaataaaaaaaaaatgatgaattgaattgaaaagctagcttatcgatgataagctgtcaaagatgagaattaattccacggactatagactatactagatactccgtctactgtacgatacacttccgctcaggtccttgtcctttaacgaggccttaccactcttttgttactctattgatccagctcagcaaaggcagtgtgatctaagattctatcttcgcgatgtagtaaaactagctagaccgagaaagagactagaaatgcaaaaggcacttctacaatggctgccatcattattatccgatgtgacgctgcagcttctcaatgatattcgaatacgctttgaggagatacagcctaatatccgacaaactgttttacagatttacgatcgtacttgttacccatcattgaattttgaacatccgaacctgggagttttccctgaaacagatagtatatttgaacctgtataataatatatagtctagcgctttacggaagacaatgtatgtatttcggttcctggagaaactattgcatctattgcataggtaatcttgcacgtcgcatccccggttcattttctgcgtttccatcttgcacttcaatagcatatctttgttaacgaagcatctgtgcttcattttgtagaacaaaaatgcaacgcgagagcgctaatttttcaaacaaagaatctgagctgcatttttacagaacagaaatgcaacgcgaaagcgctattttaccaacgaagaatctgtgcttcatttttgtaaaacaaaaatgcaacgcgacgagagcgctaatttttcaaacaaagaatctgagctgcatttttacagaacagaaatgcaacgcgagagcgctattttaccaacaaagaatctatacttcttttttgttctacaaaaatgcatcccgagagcgctatttttctaacaaagcatcttagattactttttttctcctttgtgcgctctataatgcagtctcttgataactttttgcactgtaggtccgttaaggttagaagaaggctactttggtgtctattttctcttccataaaaaaagcctgactccacttcccgcgtttactgattactagcgaagctgcgggtgcattttttcaagataaaggcatccccgattatattctataccgatgtggattgcgcatactttgtgaacagaaagtgatagcgttgatgattcttcattggtcagaaaattatgaacggtttcttctattttgtctctatatactacgtataggaaatgtttacattttcgtattgttttcgattcactctatgaatagttcttactacaatttttttgtctaaagagtaatactagagataaacataaaaaatgtagaggtcgagtttagatgcaagttcaaggagcgaaaggtggatgggtaggttatatagggatatagcacagagatatatagcaaagagatacttttgagcaatgtttgtggaagcggtattcgcaatgggaagctccaccccggttgataatcagaaaagccccaaaaacaggaagattattatcaaaaaggatcttcacctagatccttttaaattaaaaatgaagttttaaatcaatctaaagtatatatgagtaaacttggtctgacagttaccaatgcttaatcagtgaggcacctatctcagcgatctgtctatttcgttcatccatagttgcctgactccccgtcgtgtagataactacgatacgggagcgcttaccatctggccccagtgctgcaatgataccgcgagacccacgctcaccggctccagatttatcagcaataaaccagccagccggaagggccgagcgcagaagtggtcctgcaactttatccgcctccatccagtctattaattgttgccgggaagctagagtaagtagttcgccagttaatagtttgcgcaacgttgttggcattgctacaggcatcgtggtgtcactctcgtcgtttggtatggcttcattcagctccggttcccaacgatcaaggcgagttacatgatcccccatgttgtgcaaaaaagcggttagctccttcggtcctccgatcgttgtcagaagtaagttggccgcagtgttatcactcatggttatggcagcactgcataattctcttactgtcatgccatccgtaagatgcttttctgtgactggtgagtactcaaccaagtcattctgagaatagtgtatgcggcgaccgagttgctcttgcccggcgtcaatacgggataatagtgtatcacatagcagaactttaaaagtgctcatcattggaaaacgttcttcggggcgaaaactctcaaggatcttaccgctgttgagatccagttcgatgtaacccactcgtgcacccaactgatcttcagcatcttttactttcaccagcgtttctgggtgagcaaaaacaggaaggcaaaatgccgcaaaaaagggaataagggcgacacggaaatgttgaatactcatactcttcctttttcaatattattgaagcatttatcagggttattgtctcatgagcggatacatatttgaatgtatttagaaaaataaacaaataggggttccgcgcacatttccccgaaaagtgccacctgctaagaaaccattattatcatgacattaacctataaaaataggcgtatcacgaggccctttcgtc' assert eq(correct,prods.circular[0].result, circular=True) print prods.circular[0].overlap_sizes;raw_input("!!!") import time for p in prods.circular[0].sticky_fragments: ape(p) time.sleep(1) for x in prods.linear: print len(x.result) # for seq, spc in zip(prods.circular[0].sticky_fragments, prods.circular[0].sticky_offsets): # print " "*spc+str(seq.seq) # for seq, spc in zip(prods.circular[0].source_fragments, prods.circular[0].source_offsets): # print " "*spc+str(seq.seq) ''' sticky s e s e 1 1 2 2 | | | | <------> %abA%%--2479--!!OmD!! <--------------> !!OmD!!---5681---------=pX1= <------> =pX1=--2389--%%abA%% <------------> s e s e 1 1 2 2 | | | | <------> =pX1=%%abA%%--2577--!!OmD!! <--------------> --------!!OmD!!---5681---------=pX1= <------> =pX1=--2389--%%abA%%!!OmD!! sticky 0 (2479) 2418 (5681) 8001 (2389) source 0 (2479) 2516 (5681) 8099 (2389) ''' # import subprocess # import tempfile, os # with tempfile.NamedTemporaryFile() as temp: # temp.write('Some data') # temp.flush() # temp.seek(0) # print temp.name, os.path.isfile(temp.name) # print temp.read() # print "gedit {}".format(temp.name) # p = subprocess.Popen( "gedit {}".format(temp.name) )