---
title: Core module
keywords: fastai
sidebar: home_sidebar
summary: "API details"
description: "API details"
nb_path: "nbs/00_Core.ipynb"
---
{% raw %}
{% endraw %} {% raw %}
!which python
/home/yh3455/miniconda3/envs/seqpy3v0/bin/python
{% endraw %}

1. parameters

{% raw %}
from SEQLinkage.Main import *
{% endraw %}

args = Args().parser.parse_args(['--fam','../sample_i/rare_positions/sample_i_coding.hg38_multianno.fam', '--vcf', '../sample_i/rare_positions/sample_i_coding.hg38_multianno.vcf.gz', '--blueprint','./data/vipgenemap.hg38.txt','-f','MERLIN', '--tempdir','./Tempdir_s1', '--build', 'hg38', '--freq', 'AF', '-K', '0.001', '--moi', 'AD', '-W', '0', '-M', '1', '--theta-max', '0.5', '--theta-inc', '0.05','--run-linkage', '--output', './testseqlink'])

args = Args().parser.parse_args(['--fam','../sample_i/rare_positions/sample_i_coding.hg38_multianno.fam', '--vcf', '../sample_i/vcf/small_sample_i.vcf.gz', '--blueprint','./data/vipgenemap.hg38.txt','-f','MERLIN', '--tempdir','./Tempdir_s1', '--build', 'hg38', '--freq', 'AF', '-K', '0.001', '--moi', 'AD', '-W', '0', '-M', '1', '--theta-max', '0.5', '--theta-inc', '0.05','--run-linkage', '--output', './testseqlink'])

args = Args().parser.parse_args(['--fam','./seqlinkage-example/seqlinkage-example.fam','--vcf','./seqlinkage-example/seqlinkage-example.vcf.gz','-f','MERLIN', '--tempdir','./seqlinkage-example/tmprst', '--build','hg19','--blueprint','./seqlinkage-example/twogenomap.txt','--freq','EVSEAAF','-K','0.001','--moi','AR','-W','0','-M','1', '--theta-max','0.5','--theta-inc','0.05','--run-linkage','--output','./seqlinkage-example/tsq20211130'])

Open question: should we set the `mle` parameter to true?

args = Args().parser.parse_args(['--fam','../MWE/sample2_uniq.fam', '--vcf', '../MWE/sample_ii_coding.hg38_multianno.vcf.gz', '--blueprint','../MWE/genemap.hg38.txt', '--chrom-prefix','1','-f','MERLIN', '--tempdir','./Tempdir', '--build', 'hg38', '--freq', 'AF', '-K', '0.001', '--moi', 'AD', '-W', '0', '-M', '1', '--theta-max', '0.5', '--theta-inc', '0.05','--run-linkage', '--output', './testseqlink1105'])

args = Args().parser.parse_args('--fam seqlinkage-example/seqlinkage-example.fam --vcf seqlinkage-example/seqlinkage-example.vcf.gz -f MERLIN --blueprint data/genemap.txt --freq EVSEAAF'.split())

{% raw %}
# Build the option namespace from a command-line style string: the string is
# split on whitespace and handed to the parser exposed by Args().  Only
# --fam, --vcf, -f, --blueprint and --freq are given here.
args = Args().parser.parse_args('--fam data/mwe_normal_fam.csv --vcf data/first1000snp_full_samples.vcf.gz -f MERLIN --blueprint data/genemap.hg38.txt --freq AF'.split())
{% endraw %} {% raw %}
args
Namespace(bin=0.8, blueprint='data/genemap.hg38.txt', single_markers=False, tfam='data/mwe_normal_fam.csv', vcf='data/first1000snp_full_samples.vcf.gz', build='hg19', prephased=False, freq='AF', freq_by_fam=None, mle=False, rvhaplo=False, recomb_max=1, recomb_cross_fam=False, rsq=0.0, include_vars=None, maf_cutoff=1.0, chr_prefix=None, output=None, format=['MERLIN'], prevalence=None, inherit_mode=None, wild_pen=None, muta_pen=None, theta_max=0.5, theta_inc=0.05, run_linkage=False, output_limit=10, jobs=16, tempdir=None, vanilla=True, quiet=False, debug=False, no_save=False, func=<function main at 0x2b8c25c03c10>)
{% endraw %}

2. from Core import deepcopy

{% raw %}
{% endraw %}

3. RData class

{% raw %}

class RData[source]

RData(vcf, tfam) :: dict

dict() -> new empty dictionary dict(mapping) -> new dictionary initialized from a mapping object's (key, value) pairs dict(iterable) -> new dictionary initialized as if via: d = {} for k, v in iterable: d[k] = v dict(**kwargs) -> new dictionary initialized with the name=value pairs in the keyword argument list. For example: dict(one=1, two=2)

{% endraw %} {% raw %}
{% endraw %}

4. RegionExtractor class

{% raw %}
class RegionExtractor:
    '''Extract given genomic region from VCF
    converting genotypes into dictionary of
    genotype list

    Usage pattern visible in this class: call getRegion(region) to select
    the region, then apply(data) to load its variants into ``data``.
    '''
    def __init__(self, filename, build = env.build, chr_prefix = None, allele_freq_info = None, include_vars_file=None):
        '''Open the VCF stream and remember the extraction options.

        filename          -- VCF file handed to cstatgen.VCFstream
        build             -- genome build passed to the PseudoAutoRegion checkers
        chr_prefix        -- optional prefix prepended to chromosome names by getRegion()
        allele_freq_info  -- name, or list of names, of the INFO field(s) read as allele frequency
        include_vars_file -- optional text file of "chrom pos" lines; when given, only
                             listed positions found in the region go to data.include_vars
        '''
        self.vcf = cstatgen.VCFstream(filename)
        self.chrom = self.startpos = self.endpos = self.name = None
        self.chr_prefix = chr_prefix
        # name of allele frequency meta info
        self.af_info = allele_freq_info
        self.xchecker = PseudoAutoRegion('X', build)
        self.ychecker = PseudoAutoRegion('Y', build)
        self.include_vars_file = include_vars_file

    @staticmethod
    def _flip_alleles(gs):
        '''Return genotypes with allele codes swapped (1 <-> 2); "00" is kept as is.'''
        return tuple(g if g == '00' else ''.join([str(3 - int(a)) for a in g]) for g in gs)

    def _read_maf(self):
        '''Read the allele frequency of the current VCF line.

        Returns (maf, large_maf).  With a list of INFO names both values are
        lists (one entry per name); otherwise maf is a float, or None when no
        INFO name is configured, and large_maf is a bool.  A frequency above
        0.5 is folded to 1 - freq and flagged in large_maf.  An INFO value
        that cannot be parsed as a float counts as 0.0.

        Raises ValueError when the frequency field cannot be read at all.
        '''
        try:
            if isinstance(self.af_info, list):
                maf = []
                for pop_info in self.af_info:
                    try:
                        maf.append(float(self.vcf.GetInfo(pop_info)))
                    except ValueError:
                        maf.append(0.0)
                large_maf = [freq > 0.5 for freq in maf]
                maf = [1 - freq if flipped else freq for freq, flipped in zip(maf, large_maf)]
                return maf, large_maf
            large_maf = False
            try:
                maf = float(self.vcf.GetInfo(self.af_info)) if self.af_info else None
            except ValueError:
                maf = 0.0
            # maf is None when no frequency field is configured; comparing
            # None with a float is a TypeError on Python 3, so guard it.
            if maf is not None and maf > 0.5:
                large_maf = True
                maf = 1 - maf
            return maf, large_maf
        except Exception as err:
            raise ValueError("VCF line {}:{} does not have valid allele frequency field {}!".\
                             format(self.vcf.GetChrom(), self.vcf.GetPosition(), self.af_info)) from err

    def apply(self, data):
        '''Load the variants of the selected region into ``data``.

        For every bi-allelic site the variant record is appended to
        data.variants and each family's genotypes are appended to
        data.genotype_all; sites that are polymorphic within a family are
        additionally recorded in data.famvaridx and in data[person].

        Returns 1 when the region yields no variant, 0 otherwise.
        Raises ValueError on a genotype/sample count mismatch or when the
        allele frequency field cannot be read.
        '''
        # Clean up
        data.reset()
        data.chrom = self.chrom
        self.vcf.Extract(self.chrom, self.startpos, self.endpos)
        varIdx = 0
        # for each variant site
        while (self.vcf.Next()):
            # skip tri-allelic sites
            if not self.vcf.IsBiAllelic():
                with env.triallelic_counter.get_lock():
                    env.triallelic_counter.value += 1
                continue
            # skip a site located at the same position as the last kept variant
            if len(data.variants) > 0 and self.vcf.GetPosition() == data.variants[-1][1]:
                continue
            # check if the line's sample number matches the entire VCF sample number
            if not self.vcf.CountSampleGenotypes() == self.vcf.sampleCount:
                raise ValueError('Genotype and sample mismatch for region {}: {:,d} vs {:,d}'.\
                             format(self.name, self.vcf.CountSampleGenotypes(), self.vcf.sampleCount))
            # valid line found, get variant info
            maf, large_maf = self._read_maf()
            data.variants.append([self.vcf.GetChrom(), self.vcf.GetPosition(), self.name, maf])
            # for each family assign member genotype if the site is non-trivial to the family
            for k in data.families:
                gs = self.vcf.GetGenotypes(data.famsampidx[k])
                if len(data.freq_by_fam) > 0:
                    # per-family frequency source: use the flag of that family's population
                    flip = large_maf[self.af_info.index(data.freq_by_fam[k])]
                else:
                    # NOTE(review): if af_info is a list while freq_by_fam is empty,
                    # large_maf is a non-empty list and therefore always truthy here;
                    # kept as in the original -- confirm this combination cannot occur.
                    flip = large_maf
                if flip:
                    gs = self._flip_alleles(gs)
                for person, g in zip(data.families[k], gs):
                    data.genotype_all[person].append(g)
                if len(set(''.join(gs))) <= 1:
                    # skip monomorphic gs
                    continue
                if len(set(''.join([x for x in gs if x != "00"]))) <= 1:
                    # polymorphic only because of missing calls
                    data.wtvar[k].append(varIdx)
                # this variant is found in the family
                data.famvaridx[k].append(varIdx)
                for person, g in zip(data.families[k], gs):
                    data[person].append(g)
            varIdx += 1
        #
        if varIdx == 0:
            return 1
        if self.include_vars_file is not None:
            with open(self.include_vars_file) as invar_fh:
                for invar_line in invar_fh:
                    fields = invar_line.split()
                    if not fields:
                        # tolerate blank lines (e.g. a trailing newline)
                        continue
                    chrom, pos = fields
                    pos_int = int(pos)
                    if any(v[0] == chrom and v[1] == pos_int for v in data.variants):
                        data.include_vars.append("{}".format(pos))
        else:
            data.include_vars = ["{}".format(item[1]) for item in data.variants]
        with env.variants_counter.get_lock():
            env.variants_counter.value += varIdx
        return 0


    def getRegion(self, region):
        '''Select the region to extract.

        region -- sequence whose first four items are chrom, start, end, name.
        The end position is stored as end + 1.  X/23 and Y/24 regions whose
        start or end is flagged by the corresponding PseudoAutoRegion checker
        are renamed to 'XY'; chr_prefix, when set, is prepended if missing.
        '''
        self.chrom, self.startpos, self.endpos, self.name = region[:4]
        self.startpos = int(self.startpos)
        self.endpos = int(self.endpos) + 1
        if self.chrom in ['X','23']:
            if self.xchecker.check(self.startpos) or self.xchecker.check(self.endpos):
                self.chrom = 'XY'
        if self.chrom in ['Y','24']:
            if self.ychecker.check(self.startpos) or self.ychecker.check(self.endpos):
                self.chrom = 'XY'
        if self.chr_prefix and not self.chrom.startswith(self.chr_prefix):
            self.chrom = self.chr_prefix + self.chrom
{% endraw %}

5. MarkerMaker class

{% raw %}
class MarkerMaker:
    def __init__(self, wsize, maf_cutoff = None,single_markers=False,recomb_max = 1,af_info=None,freq_by_fam=False,rsq=0.0,mle=False,rvhaplo=False,recomb_perfam=True):
        '''Keep the analysis options and create the two cstatgen engines.

        wsize is handed to cstatgen.HaplotypeCoder; it is also used as the
        r2 threshold for LD clustering unless it is 0 or >= 1, in which case
        clustering is disabled (self.r2 is None).
        '''
        # constants used when recoding genotypes
        self.missings = ("0", "0")
        self.gtconv = {'1':0, '2':1}
        # options copied from the arguments
        self.recomb_max = recomb_max
        self.haplotyper = cstatgen.HaplotypingEngine(verbose = env.debug)
        self.af_info = af_info
        self.freq_by_fam = freq_by_fam
        self.rsq = rsq
        # MAF source: MLE estimate from families, or founder allele counts
        self.mle = mle
        self.count = not mle
        self.rvhaplo = rvhaplo
        self.recomb_perfam = recomb_perfam
        clustering_disabled = (wsize == 0 or wsize >= 1)
        self.r2 = None if clustering_disabled else wsize
        self.coder = cstatgen.HaplotypeCoder(wsize)
        self.maf_cutoff = maf_cutoff
        self.single_markers = single_markers
        self.name = None

    def apply(self, data):
        '''Haplotype, recode and reformat the genotypes held in ``data``.

        Steps: __Haplotype fills the temporary containers; the families are
        then coded either one variant at a time (__AssignSNVHaplotypes, when
        every family has exactly one variant) or through LD clustering and
        haplotype coding; finally __FormatHaplotypes reshapes the result.

        The intermediate haplotypes, mafs and varnames are kept on self for
        inspection.  Always returns 0; exceptions from the steps propagate.
        '''
        import sys
        # temp raw haplotype, maf and variant names data
        haplotypes = OrderedDict()
        mafs = {}   ##Per fam per variant
        uniq_vars = []
        exclude_vars = []
        varnames = {}
        recombPos = {}
        # haplotyping plus collect found allele counts
        # and computer founder MAFS
        self.__Haplotype(data, haplotypes, mafs, varnames,recombPos,uniq_vars,exclude_vars)
        if env.debug:
            # diagnostic dump only in debug mode, serialized like the other debug prints
            with env.lock:
                print('__Haplotype',haplotypes, mafs, varnames,recombPos,uniq_vars,exclude_vars, file = sys.stderr)
        self.haplotypes, self.mafs, self.varnames = haplotypes, mafs, varnames  ###anno
        if len(varnames):
            if all(len(names) == 1 for names in varnames.values()):
                # all families have only one variant
                self.__AssignSNVHaplotypes(data, haplotypes, mafs, varnames)
            else:
                # calculate LD clusters using founder haplotypes
                clusters = self.__ClusterByLD(data, haplotypes, varnames)
                # recoding the genotype of the region
                self.__CodeHaplotypes(data, haplotypes, mafs, varnames, clusters)
        self.__FormatHaplotypes(data,recombPos,varnames,uniq_vars)
        return 0

    def __getMLEfreq(self,data, markers_to_analyze, pos_all, families, rsq, output_log):
        output_sample=[]
        mle_mafs={}
        if len(markers_to_analyze)==0:
            return mle_mafs
        for fam in families:
            for person in data.tfam.sort_family(fam):
                output_sample.append([])
                last_ele=len(output_sample)-1
                output_sample[last_ele] = data.tfam.samples[person][:-1]
                if person in data.samples:
                    for marker in markers_to_analyze:
                        idx=int(marker.split('-')[0][1:])
                        output_sample[last_ele].append(data.genotype_all[person][idx])
                else:
                    output_sample[last_ele].extend(["00"] * len(markers_to_analyze))
        with stdoutRedirect(to = output_log):
            af = self.haplotyper.Execute(data.chrom, markers_to_analyze, pos_all, output_sample, rsq, output_log,False)
        with open(output_log) as mle_fh:
            for line in mle_fh:
                if line.startswith('V'):
                    tmp_eles = line.split(':')
                    if tmp_eles[0] not in mle_mafs:
                        freqs=tmp_eles[1].split()
                        mle_maf = float(freqs[1])
                        if mle_maf>0.5:
                            mle_mafs[tmp_eles[0]]=float("%.9f"%(1-mle_maf))
                        else:
                            #alt allele is more frequent
                            mle_mafs[tmp_eles[0]]=float("%.9f"%mle_maf)
                            marker_idx=int(tmp_eles[0].split('-')[0][1:])
                            for fam in families:
                                if marker_idx not in data.famvaridx[fam]:
                                    continue
                                tmp_famvaridx=data.famvaridx[fam].index(marker_idx)
                                for person in data.families[fam]:
                                    tmpg=data.genotype_all[person][marker_idx]
                                    tmpg_switch=''.join([str(3-int(tg)) for tg in tmpg]) if tmpg!='00' else tmpg
                                    data.genotype_all[person][marker_idx]=tmpg_switch
                                    tmpg2=data[person][tmp_famvaridx]
                                    tmpg_switch2=''.join([str(3-int(tg)) for tg in tmpg2]) if tmpg2!='00' else tmpg2
                                    data[person][tmp_famvaridx]=tmpg_switch2
        return mle_mafs

    def __computefounderfreq(self,data, families):
        #count founder alleles to estimate MAF
        total_founder_alleles=0
        tmp_haplotypes=OrderedDict()
        tmp_mafs={}
        for item in families:
            tmp_haplotypes[item] = self.__PedToHaplotype(data.getFamSamples(item))
            # count founder alleles
            for hap in tmp_haplotypes[item]:
                if not data.tfam.is_founder(hap[1]):
                    continue
                total_founder_alleles+=1.0
                for idxv, v in enumerate(data.getFamVariants(item,style="map")[0]):
                    if v not in tmp_mafs:
                        # [#alt, #haplotypes]
                        tmp_mafs[v] = [0, 0]
                    gt = hap[2 + idxv][1] if hap[2 + idxv][0].isupper() else hap[2 + idxv][0]
                    if not gt == "?":
                    #genotyped
                        tmp_mafs[v][0] += self.gtconv[gt]
                    else:
                    #genotype is missing
                        tmp_mafs[v][1] -= 1.0
        #compute MAFs based on counts
        for v in tmp_mafs:
            if type(tmp_mafs[v]) is not list:
                continue
            tmp_mafs[v] = tmp_mafs[v][0] / (tmp_mafs[v][1]+total_founder_alleles) if tmp_mafs[v][1]+total_founder_alleles > 0 else 0.0
        return tmp_mafs

    def __Haplotype(self, data, haplotypes, mafs, varnames,recombPos,uniq_vars,exclude_vars):
        '''genetic haplotyping. haplotypes stores per family data

        The containers supplied by the caller are filled in place:
          haplotypes   -- family -> haplotype rows (from the haplotyping engine,
                          or from __PedToHaplotype when env.prephased is set)
          varnames     -- family -> variant names from data.getFamVariants(style="map")
          mafs         -- variant name -> MAF, taken from the VCF values when all
                          of a family's values are truthy, otherwise estimated
                          from founder haplotypes
          exclude_vars -- variant names whose MAF exceeds self.maf_cutoff
        recombPos and uniq_vars are accepted but not modified here.

        Families without variants, and families left without variants after the
        MAF filter, get self.missings for every member.
        '''
        # FIXME: it is SWIG's (2.0.12) fault not to properly destroy the object "Pedigree" in "Execute()"
        # So there is a memory leak here which I tried to partially handle on C++
        #
        # Per family haplotyping
        #
        self.markers = ["V{}-{}".format(idx, item[1]) for idx, item in enumerate(data.variants)]
        for item in data.families:
            varnames[item], positions, vcf_mafs = data.getFamVariants(item, style = "map")
            if len(varnames[item]) == 0:
                for person in data.families[item]:
                    data[person] = self.missings
                continue
            if env.debug:
                with env.lock:
                    sys.stderr.write('\n'.join(['\t'.join(x) for x in data.getFamSamples(item)]) + '\n\n')
            # haplotyping
            self.hap = {}
            with env.lock:
                if not env.prephased:
                    tmp_log_output=env.tmp_log + str(os.getpid())
                    haplotypes[item] = self.haplotyper.Execute(data.chrom, varnames[item], sorted(positions),
                                                               data.getFamSamples(item), self.rsq, tmp_log_output)[0]
                    if env.debug:
                        # diagnostic dump only in debug mode
                        print('haplotyper execute',item,haplotypes[item], file = sys.stderr)
                    self.hap[item] = haplotypes[item]
                else:
                    haplotypes[item] = self.__PedToHaplotype(data.getFamSamples(item))

            if len(haplotypes[item]) == 0:
                # C++ haplotyping implementation failed
                with env.chperror_counter.get_lock():
                    env.chperror_counter.value += 1
            # either use privided MAF or computer MAF
            if all(vcf_mafs):
                for idx, v in enumerate(varnames[item]):
                    if v not in mafs:
                        mafs[v] = vcf_mafs[idx]
            else:
                # count founder alleles
                for hap in haplotypes[item]:
                    if not data.tfam.is_founder(hap[1]):
                        continue
                    for idxv, v in enumerate(varnames[item]):
                        if v not in mafs:
                            # [#alt, #haplotypes]
                            mafs[v] = [0, 0]
                        gt = hap[2 + idxv][1] if hap[2 + idxv][0].isupper() else hap[2 + idxv][0]
                        if not gt == "?":
                            mafs[v][0] += self.gtconv[gt]
                            mafs[v][1] += 1.0
        #
        # Compute founder MAFs
        #
        for v in mafs:
            if type(mafs[v]) is not list:
                # already a frequency taken from the VCF
                continue
            mafs[v] = mafs[v][0] / mafs[v][1] if mafs[v][1] > 0 else 0.0
        if env.debug:
            with env.lock:
                print("variant mafs = ", mafs, "\n", file = sys.stderr)
        #
        # Drop some variants if maf is greater than given threshold
        #
        if self.maf_cutoff is not None:
            # slice-assign so that the list owned by the caller is the one filled
            exclude_vars[:] = [v for v in mafs if mafs[v] > self.maf_cutoff]
            excluded = set(exclude_vars)
            # iterate over a snapshot of the keys: families may be deleted below,
            # and mutating a dict while iterating it raises RuntimeError
            for i in list(haplotypes):
                haplotypes[i] = listit(haplotypes[i])
                for j in range(len(haplotypes[i])):
                    haplotypes[i][j] = haplotypes[i][j][:2] + \
                      [x for idx, x in enumerate(haplotypes[i][j][2:]) if varnames[i][idx] not in excluded]
                varnames[i] = [x for x in varnames[i] if x not in excluded]
                # handle trivial data
                if len(varnames[i]) == 0:
                    for person in data.families[i]:
                        data[person] = self.missings
                    del varnames[i]
                    del haplotypes[i]
            # count how many variants are removed
            with env.commonvar_counter.get_lock():
                env.commonvar_counter.value += len(exclude_vars)

    def __ClusterByLD(self, data, haplotypes, varnames):
        if self.r2 is None:
            return None
        # get founder haplotypes
        founder_haplotypes = []
        markers = sorted(set(itertools.chain(*varnames.values())), key = lambda x: int(x.split("-")[0][1:]))
        for item in haplotypes:
            for ihap, hap in enumerate(haplotypes[item]):
                if not data.tfam.is_founder(hap[1]):
                    continue
                gt = [hap[2 + varnames[item].index(v)] if v in varnames[item] else '?' for v in markers]
                founder_haplotypes.append(("{}-{}".format(hap[1], ihap % 2), "".join([x[1] if x[0].isupper() else x[0] for x in gt])))
        # calculate LD blocks, use r2 measure
        ld = Align.create(founder_haplotypes).matrixLD(validCharacters="12")["r2"]
        blocks = []
        for j in ld:
            block = [j]
            for k in ld[j]:
                if ld[j][k] > self.r2:
                    block.append(k)
            if len(block) > 1:
                blocks.append(block)
        self.ld, self.blocks = ld, blocks
        # get LD clusters
        clusters = [[markers[idx] for idx in item] for item in list(connected_components(blocks))]
        if env.debug:
            with env.lock:
                print("LD blocks: ", blocks, file = sys.stderr)
                print("LD clusters: ", clusters, file = sys.stderr)
        return clusters


    def __CodeHaplotypes(self, data, haplotypes, mafs, varnames, clusters):
        '''Recode family haplotypes with self.coder and store the result on data.

        1. Prepare ``haplotypes``: families that have no haplotype entry but
           whose first member is not ('0','0') get all-wild-type (or all
           missing) rows; families lacking a wild-type haplotype get an extra
           'FAKEPERSON' row; rows of data.missing_persons are blanked.
        2. Run self.coder.Execute on the haplotypes, the per-family MAFs and
           the cluster indices derived from ``clusters`` (may be None).
        3. Copy coded haplotypes into data[person], collect the coding
           patterns in data.patterns and track data.superMarkerCount.
        4. Store allele frequencies per family in data.maf, rescaling
           multi-allele frequency vectors that sum to less than 1 so that the
           first entry is the wild-type frequency data.wt_maf.
        '''
        # apply CHP coding
        for item in data.famvaridx:
            if item not in haplotypes and data[data.families[item][0]] != ('0','0'):
                # when only wild-type haplotypes are present in a family, still code them instead of ignoring the family
                if self.freq_by_fam:
                    pop=data.freq_by_fam[item]
                    try:
                        varnames[item]=data.total_varnames[pop]
                        mafs[item]=data.total_mafs[pop]
                    except Exception:
                        # no totals available for this population: leave the family uncoded
                        # (narrowed from a bare except so Ctrl-C / SystemExit still propagate)
                        continue
                else:
                    varnames[item]=data.total_varnames['pop']
                    mafs[item]=data.total_mafs
                haplotypes[item]=[]
                for person in data.families[item]:
                    tmp_person=[item, person]
                    if '00' in data[person]:
                        tmp_person+=['?:']*len(varnames[item])
                    else:
                        tmp_person+=['1:']*len(varnames[item])
                    # two rows per person (one per haplotype); both refer to the same list
                    haplotypes[item].append(tmp_person)
                    haplotypes[item].append(tmp_person)
            elif item in haplotypes:
                nonvar_hap_flag=False
                #determine if wild-type haplotype is present in a family
                for hap in haplotypes[item]:
                    tmp_genes=[]
                    for tmpa in hap[2:]:
                        if 'A' in tmpa or 'B' in tmpa:
                            tmp_genes.append(tmpa[1])
                        else:
                            tmp_genes.append(tmpa[0])
                    if set(tmp_genes)==set(['1']):
                        #non variant haplotype
                        nonvar_hap_flag=True
                        break
                if not nonvar_hap_flag:
                    #if family don't have wild-type haplotype, add a fake one to ensure correct coding
                    var_num=len(varnames[item])
                    fake_person=[item, 'FAKEPERSON']+['1:']*var_num
                    haplotypes[item].append(fake_person)
                for hidx,hap in enumerate(haplotypes[item]):
                    if hap[1] in data.missing_persons:
                        missing_person=[item,hap[1]]+['?:']*len(varnames[item])
                        haplotypes[item][hidx]=missing_person

        if not clusters is None:
            # translate marker names into per-family column indices
            clusters_idx = [[[varnames[item].index(x) for x in y] for y in clusters] for item in haplotypes]
        else:
            clusters_idx = [[[]] for item in haplotypes]
        if env.debug:
            for item in haplotypes:
                with env.lock:
                    print(varnames[item],file=sys.stderr)
                    print("hap{0}\t{1}\n".format(item,haplotypes[item]),file=sys.stderr)
        self.coder.Execute(haplotypes.values(), [[mafs[item][v] for v in varnames[item]] for item in haplotypes], clusters_idx)
        if env.debug:
            with env.lock:
                if clusters:
                    print("Family LD clusters: ", clusters_idx, "\n", file = sys.stderr)
                self.coder.Print()
        # line: [fid, sid, hap1, hap2]
        # (the code below reads coded haplotypes from line[2] and line[4] and
        #  their patterns from line[3] and line[5])
        for line in self.coder.GetHaplotypes():
            if not line[1] in data:
                # this sample is not in VCF file. Every variant site should be missing
                # they have to be skipped for now
                continue
            data[line[1]] = (line[2].split(','), line[4].split(','))
            #sub-region count for each sample individual
            superMarkerCount=len(data[line[1]][0])
            if line[0] not in data.patterns:
                data.patterns[line[0]]=[[] for x in range(superMarkerCount)]
            for t_Marker in range(superMarkerCount):
                t_pat1=line[3].split(',')[t_Marker]
                t_pat2=line[5].split(',')[t_Marker]
                if t_pat1 not in data.patterns[line[0]][t_Marker]:
                    data.patterns[line[0]][t_Marker].append(t_pat1)
                if t_pat2 not in data.patterns[line[0]][t_Marker]:
                    data.patterns[line[0]][t_Marker].append(t_pat2)
            if len(data[line[1]][0]) > data.superMarkerCount:
                data.superMarkerCount = len(data[line[1]][0])
        # get MAF
        for item in data.famvaridx:
            if item not in haplotypes:
                # families that were not coded get all-'0' markers
                for person in data.families[item]:
                    data[person]=(['0']*data.superMarkerCount,['0']*data.superMarkerCount)
        for item in haplotypes:
            data.maf[item] = self.coder.GetAlleleFrequencies(item)
            if not len(data.maf[item][0]):
                continue
            data.varnames_by_fam[item]=varnames[item]
            wt_maf=0
            if self.freq_by_fam:
                try:
                    wt_maf=data.wt_maf[data.freq_by_fam[item]]
                except Exception:
                    # best effort: keep wt_maf = 0 when the population has no entry
                    pass
            else:
                wt_maf=data.wt_maf['pop']
            tmp_data_maf=[]
            for v in data.maf[item]:
                if len(v)==1:
                    tmp_data_maf.append((v[0],1-v[0]))
                else:
                    if np.sum(v)<1:
                        # rescale so that wt_maf comes first and the remaining
                        # entries share 1 - wt_maf in their original proportions
                        tmp_ratio=sum(v[1:])/(1-wt_maf)
                        tmp_list=[wt_maf]
                        if tmp_ratio==0:
                            tmp_list.append(1-wt_maf)
                        else:
                            for tmpv in v[1:]:
                                tmp_list.append(tmpv/tmp_ratio)
                        tmp_data_maf.append(tuple(tmp_list))
                    else:
                        tmp_data_maf.append(v)
            data.maf[item]=tuple(tmp_data_maf)
        if env.debug:
            with env.lock:
                print("marker freqs = ", data.maf, "\n", file = sys.stderr)


    def __AssignSNVHaplotypes(self, data, haplotypes, mafs, varnames):
        '''Assign single-variant genotypes and frequencies to every family.

        For families present in ``haplotypes`` each member receives the pair
        of alleles found in column 2 of its two consecutive haplotype rows
        ('?' becomes '0'); members listed in data.missing_persons get
        ('0','0').  data.maf[item] is set from mafs[item] of the family's
        first variant.  Families in data.famvaridx without haplotypes whose
        first member is not ('0','0') are coded ('1','1') (or ('0','0') where
        '00' is present) with frequencies taken from data.wt_maf.
        '''
        def allele_of(cell):
            # when the first character is uppercase the allele is the second
            # character; an unknown allele '?' is reported as '0'
            token = cell[1] if cell[0].isupper() else cell[0]
            return '0' if token == '?' else token

        for item in haplotypes:
            # each person's haplotype
            data.varnames_by_fam[item]=varnames[item]
            token = ''
            for idx,line in enumerate(haplotypes[item]):
                if line[1] in data.missing_persons:
                    data[line[1]]=('0','0')
                elif not idx % 2:
                    # even row: first haplotype, remember its allele
                    token = allele_of(line[2])
                else:
                    # odd row: second haplotype completes the genotype
                    data[line[1]] = (token, allele_of(line[2]))

            # get MAF
            data.maf[item] = [(1 - mafs[item][varnames[item][0]], mafs[item][varnames[item][0]])]
            data.maf[item] = tuple(tuple(np.array(v) / np.sum(v)) if np.sum(v) else v
                              for v in data.maf[item])
        for item in data.famvaridx:
            if item not in haplotypes and data[data.families[item][0]] != ('0','0'):
                for person in data.families[item]:
                    if '00' in data[person]:
                        data[person]=('0','0')
                    else:
                        data[person]=('1','1')
                t_maf=0
                if self.freq_by_fam:
                    try:
                        t_maf=data.wt_maf[data.freq_by_fam[item]]
                    except Exception:
                        # no wild-type frequency for this family's population:
                        # treat the whole family as missing (narrowed from a bare
                        # except so Ctrl-C / SystemExit still propagate)
                        for person in data.families[item]:
                            data[person]=('0','0')
                else:
                    t_maf=data.wt_maf['pop']
                data.maf[item]=((t_maf,1-t_maf),)
        if env.debug:
            with env.lock:
                print("marker freqs = ", data.maf, "\n", file = sys.stderr)


    def __FormatHaplotypes(self, data,recombPos,varnames,uniq_vars):
        # Reformat sample genotypes
        ## Linhai Edit: Reformat to deal with recombination events in families
        if self.recomb_perfam:
            #code recombination per family basis, no need to consider overlap across families
            for person in data:
                if type(data[person]) is not tuple:
                    data[person] = self.missings
                    continue
                diff = data.superMarkerCount - len(data[person][0])
                data[person] = zip(*data[person])
                if diff > 0:
                    data[person].extend([self.missings] * diff)
        else:
            #code recombination across families to generate sub-regions that extend across families
            tmp_combined_recombPos={}
            sorted_var = sorted(uniq_vars, key=lambda x: int(x.split('-')[0][1:]))
            for fam in data.maf.keys():
                if len(data.maf[fam])>1:
                    for pair in sorted(recombPos[fam].keys(), key=lambda x:(sorted_var.index(x[0]),sorted_var.index(x[1]))):
                        if pair[1] == varnames[fam][0]:
                            ##remove recombination event if occurred at 1st RV
                            del recombPos[fam][pair]
                            continue
                        if fam not in tmp_combined_recombPos:
                            tmp_combined_recombPos[fam]=[pair]
                        else:
                            tmp_combined_recombPos[fam].append(pair)
            tmp_all_recombs=[pair for pairs in tmp_combined_recombPos.values() for pair in pairs]
            sorted_combined_recombPos=sorted(list(set(tmp_all_recombs)),key=lambda x:(sorted_var.index(x[0]),sorted_var.index(x[1])))
            recomb_fams=tmp_combined_recombPos.keys()
            ##get sub-regions that applies to all families
            for varidx,variant in enumerate(sorted_var):
                included_fams=len(recomb_fams)
                for recomb_region in sorted_combined_recombPos:
                    if varidx > sorted_var.index(recomb_region[0]) and varidx < sorted_var.index(recomb_region[1]):
                        ##if the variant is in a recombination region
                        included_fams-=1
                if included_fams==len(recomb_fams):
                    if data.combined_regions==[]:
                        data.combined_regions.append([variant])
                    else:
                        if sorted_var.index(data.combined_regions[-1][-1])==varidx-1:
                            neighbour_recomb_flag=False
                            for recomb_region in sorted_combined_recombPos:
                                recomb_idx=sorted_var.index(recomb_region[1])
                                if recomb_idx==varidx:
                                    neighbour_recomb_flag=True
                                    break
                                elif recomb_idx>varidx:
                                    break
                            if neighbour_recomb_flag:
                                data.combined_regions.append([variant])
                            else:
                                data.combined_regions[-1].append(variant)
                        else:
                            data.combined_regions.append([variant])
            ##Get the markers in families compliant with the sub_regions
            for sub_region in data.combined_regions:
                markers={}
                for fam in recomb_fams:
                    pidx=0
                    for pair in sorted(recombPos[fam].keys(), key=lambda x:(sorted_var.index(x[0]),sorted_var.index(x[1]))):
                        sub_region_start=sorted_var.index(sub_region[0])
                        sub_region_end=sorted_var.index(sub_region[-1])
                        recomb_start=sorted_var.index(pair[0])
                        recomb_end=sorted_var.index(pair[1])
                        if sub_region_end <= recomb_start:
                            markers[fam]=pidx
                            break
                        elif sub_region_end > recomb_start and sub_region_start>recomb_start and sub_region_end<recomb_end:
                            ##within the recombination region
                            markers[fam]=None
                            break
                        pidx+=1
                    if fam not in markers:
                        markers[fam]=pidx
                data.complied_markers.append(markers)
            data.superMarkerCount=len(data.combined_regions)
            #coordinates for sub_regions
            data.coordinates_by_region=[(int(sub_region[0].split('-')[1])+int(sub_region[-1].split('-')[1]))/2 for sub_region in data.combined_regions]
            for person in data:
                if type(data[person]) is not tuple:
                    data[person] = self.missings
                    continue
                diff = data.superMarkerCount - len(data[person][0])
                data[person] = zip(*data[person])
                if diff > 0:
                    if len(data[person]) == 1:
                        ##only one whole region with no recombination
                        data[person].extend(data[person] * diff)
                    else:
                        famid=''
                        for fam in data.complied_markers[0].keys():
                            if person in data.families[fam]:
                                famid=fam
                        complied_data=[]
                        for marker in data.complied_markers:
                            complied_data.append(data[person][marker[famid]])
                        data[person]=complied_data

    def __PedToHaplotype(self, ped):
        '''Split prephased ped rows into one haplotype row per allele.

        Each row of `ped` is a sequence whose first two items are the family id
        and the sample id, and whose items from index 5 on are two-character
        genotype strings. For every input row two tuples are produced, in input
        order: (fid, sid, <first allele of each genotype>...) followed by
        (fid, sid, <second allele of each genotype>...).
        Every allele is suffixed with ':'; the allele '0' is written as '?:'.
        e.g. ['13346', '5856', '0', '0', '2', '12', '10'] ->
             ('13346', '5856', '1:', '1:'), ('13346', '5856', '2:', '?:')
        Returns a tuple of tuples (empty tuple for empty input).
        '''
        def allele_column(genotypes, which):
            # pick allele number `which` (0 or 1) out of every genotype string
            return tuple('?:' if g[which] == '0' else g[which] + ':' for g in genotypes)

        rows = []
        for record in ped:
            ident = (record[0], record[1])
            genotypes = record[5:]
            for which in (0, 1):
                rows.append(ident + allele_column(genotypes, which))
        return tuple(rows)

    def getRegion(self, region):
        '''Remember the name of the unit about to be processed.

        `region` is an indexable record; its 4th item (index 3) is stored as
        self.name.
        '''
        region_name = region[3]
        self.name = region_name
{% endraw %}

6. LinkageWriter class

{% raw %}
class LinkageWriter:
    '''Collect per-region linkage output in memory and append it to
    per-chromosome files.

    Four text buffers are kept: tped (genotypes), freq (allele frequencies per
    family), chp (super-marker positions) and varfam (variant names / patterns
    per family). commit() appends them to
    <env.tmp_cache>/<env.output>.chr<chrom>.{tped,freq,chp,var}.
    '''
    def __init__(self, num_missing_append = 0):
        '''num_missing_append: how many extra missing genotypes ("0", "0") are
        appended to every tped line.'''
        self.chrom = self.prev_chrom = self.name = self.distance = self.distance_avg = self.distance_m = self.distance_f = None
        self.distance_by_region=[]
        self.mid_position=None
        self.reset()
        self.missings = ["0", "0"]
        self.num_missing = num_missing_append

    def _freq_line(self, fam, marker, freqs):
        '''Format one line of the freq buffer: family id, marker name, then
        the allele frequencies.'''
        # map() is an iterator on Python 3 and cannot be added to a list, so the
        # frequencies are converted with an explicit list comprehension
        return env.delimiter.join([fam, marker] + [str(x) for x in freqs]) + "\n"

    def apply(self, data):
        '''Append the output of the current region to the buffers.

        `data` is the per-region container; this method reads data.samples,
        data[sample], data.maf, data.superMarkerCount, data.getMidPosition(),
        data.coordinates_by_region, data.complied_markers,
        data.combined_regions, data.varnames_by_fam and data.patterns.

        Returns 2 when every sample carries the same genotype (nothing is
        written for the region), 0 otherwise.
        '''
        if self.chrom != self.prev_chrom:
            if self.prev_chrom is None:
                self.prev_chrom = self.chrom
            else:
                # new chrom entered,
                # commit whatever is in buffer before accepting new data
                self.commit()
        # write tped output
        position = str(data.getMidPosition())
        if data.superMarkerCount <= 1:
            # genotypes
            gs = [data[s][0] for s in data.samples]
            if len(set(gs)) == 1:
                # everyone's genotype is the same (most likely missing or monomorphic)
                return 2
            self.tped += env.delimiter.join([self.chrom, self.name, self.distance, position] + \
                list(itertools.chain(*gs)) + self.missings*self.num_missing) + "\n"
            # freqs
            for k in data.maf:
                self.freq += self._freq_line(k, self.name, data.maf[k][0])
        else:
            # have to expand each region into mutiple chunks to account for different recomb points
            gs = zip(*[data[s] for s in data.samples])
            # sub-chunk id
            cid = 0
            skipped_chunk = []
            self.distance_by_region=[self.distance_converter(x,int(position)) for x in data.coordinates_by_region]
            for idx, g in enumerate(gs):
                if len(set(g)) == 1:
                    skipped_chunk.append(idx)
                    continue
                cid += 1
                # NOTE(review): the distance is looked up with cid-1 (count of
                # emitted chunks), not idx (position of the chunk); the two differ
                # once a chunk has been skipped -- confirm which one is intended
                self.tped += \
                  env.delimiter.join([self.chrom, '{}[{}]'.format(self.name, cid), self.distance_by_region[cid-1], position] + \
                  list(itertools.chain(*g)) + self.missings*self.num_missing) + "\n"
            if cid == 0:
                # everyone's genotype is the same (most likely missing or monomorphic)
                return 2
            # freqs
            for k in data.maf:
                cid = 0
                for idx in range(data.superMarkerCount):
                    if idx in skipped_chunk:
                        continue
                    if not data.complied_markers:
                        #if recombination coded per family instead of across families
                        if idx >= len(data.maf[k]):
                            break
                        freq_idx = idx
                    elif len(data.maf[k])>1:
                        # NOTE(review): the looked-up marker index is used
                        # directly as a list index -- confirm it can never be None
                        freq_idx = data.complied_markers[idx][k]
                    elif len(data.maf[k])==1:
                        freq_idx = 0
                    else:
                        # family without any frequency entry: nothing to write
                        continue
                    cid += 1
                    self.freq += self._freq_line(k, '{}[{}]'.format(self.name, cid), data.maf[k][freq_idx])
        if data.combined_regions:
            self.chp += "CHP Super Marker positions: "+repr(data.combined_regions)+"\n"
        for item in data.varnames_by_fam:
            try:
                pattern_txt=[tuple(sorted(data.patterns[item][tmarker],key=lambda x:x.count('2') )) for tmarker in range(len(data.patterns[item]))]
            except Exception:
                # best effort: patterns may be unavailable for this family
                pattern_txt=''
            self.varfam += "{}\t{}\t{}\n".format(item,data.varnames_by_fam[item],pattern_txt)
        if self.counter < env.batch:
            self.counter += data.superMarkerCount
        else:
            self.commit()
        return 0

    def commit(self):
        '''Append every non-empty buffer to its per-chromosome file (named
        after self.prev_chrom), then clear the buffers via reset().'''
        buffers = ((self.tped, 'tped'), (self.freq, 'freq'), (self.chp, 'chp'), (self.varfam, 'var'))
        for text, ext in buffers:
            if not text:
                continue
            # env.lock is taken around every append to the shared output file
            with env.lock:
                with open(os.path.join(env.tmp_cache, '{}.chr{}.{}'.format(env.output, self.prev_chrom, ext)),
                          'a') as f:
                    f.write(text)
        self.reset()

    def reset(self):
        '''Empty all buffers, zero the marker counter and make the current
        chromosome the "previous" one.'''
        self.tped = ''
        self.freq = ''
        self.chp = ''
        self.varfam = ''
        self.counter = 0
        self.prev_chrom = self.chrom

    def distance_converter(self, x, mid_position):
        '''Return the region's three map distances shifted by
        (x - mid_position) / 1e6, formatted with 5 decimals and joined by ";"
        in the order distance_avg;distance_m;distance_f.'''
        delta=(x-mid_position)/1000000.0
        distance='%.5f'%(float(self.distance_avg)+delta)
        distance_m='%.5f'%(float(self.distance_m)+delta)
        distance_f='%.5f'%(float(self.distance_f)+delta)
        return ";".join([distance,distance_m,distance_f])

    def getRegion(self, region):
        '''Load chromosome, name and the three distances of the next region.

        `region` must hold exactly 7 items: item 0 is the chromosome and items
        3..6 are name, distance_avg, distance_m and distance_f (as strings).
        '''
        self.chrom = region[0]
        self.name, self.distance_avg, self.distance_m, self.distance_f = region[3:]
        self.distance = ";".join([self.distance_avg, self.distance_m, self.distance_f])
{% endraw %}

7. EncoderWorker class

{% raw %}
class EncoderWorker(Process):
    '''Worker process that pulls regions off a queue and pushes each one
    through extractor -> coder -> writer, updating the shared counters on env.

    A `None` item on the queue tells the worker to flush the writer, report
    progress, add its Mendelian-error and recombination counts to the shared
    totals and stop.
    '''
    def __init__(self, queue, length, data, extractor, coder, writer):
        '''queue: a plain list (consumed with pop(0)) or an object with get();
        length: total number of units, used for the progress percentage;
        data: the container handed to every stage's apply();
        extractor / coder / writer: the three pipeline stages.'''
        Process.__init__(self)
        self.queue, self.data = queue, data
        self.numGrps = float(length)
        self.extractor, self.maker, self.writer = extractor, coder, writer

    def report(self):
        '''Log how many units succeeded and the fraction of units seen so far.'''
        done = env.success_counter.value
        fraction = env.total_counter.value / self.numGrps
        env.log('{:,d} units processed {{{:.2%}}} ...'.format(done, fraction), flush = True)

    def run(self):
        '''Consume the queue until a `None` sentinel (or Ctrl-C) is seen.'''
        # non-zero stage status -> name of the env counter that tracks it
        # (-1: previous module failed, 1: null, 2: trivial)
        counter_for_status = {-1: 'chperror_counter', 1: 'null_counter', 2: 'trivial_counter'}
        while True:
            try:
                if isinstance(self.queue, list):
                    region = self.queue.pop(0)
                else:
                    region = self.queue.get()
                if region is None:
                    # sentinel: flush, report and publish this worker's totals
                    self.writer.commit()
                    self.report()
                    # total mendelian errors found
                    with env.mendelerror_counter.get_lock():
                        env.mendelerror_counter.value += self.maker.haplotyper.CountMendelianErrors()
                    # total recombination events found
                    with env.recomb_counter.get_lock():
                        env.recomb_counter.value += self.maker.coder.CountRecombinations()
                    break
                with env.total_counter.get_lock():
                    env.total_counter.value += 1
                self.extractor.getRegion(region)
                self.writer.getRegion(region)
                self.maker.getRegion(region)
                for stage in (self.extractor, self.maker, self.writer):
                    status = stage.apply(self.data)
                    counter_name = counter_for_status.get(status)
                    if counter_name is not None:
                        counter = getattr(env, counter_name)
                        with counter.get_lock():
                            counter.value += 1
                    if status != 0:
                        # a stage did not succeed: skip the remaining stages
                        break
                else:
                    # every stage returned 0
                    with env.success_counter.get_lock():
                        env.success_counter.value += 1
                if env.total_counter.value % (env.batch * env.jobs) == 0:
                    self.report()
            except KeyboardInterrupt:
                break
{% endraw %}

Old version of the RData class

{% raw %}
class RData(dict):
    '''Per-region data container: a dict of {sample id: [genotype, ...]} for
    the samples present in both the VCF and the TFAM, with the region's
    variants and per-family bookkeeping stored as attributes.
    '''
    def __init__(self, samples_vcf, tfam):
        '''samples_vcf: sample ids in VCF order;
        tfam: object exposing `samples`, `families` and `sort_family(fam)`.'''
        # tfam.samples: a dict of {sid:[fid, pid, mid, sex, trait], ...}
        # tfam.families: a dict of {fid:[s1, s2 ...], ...}
        self.tfam = tfam
        # samples have to be in both vcf and tfam data
        self.samples = OrderedDict([(k, tfam.samples[k]) for k in samples_vcf if k in tfam.samples])
        # a dict of {fid:[member names], ...}
        self.families = {k : [x for x in self.samples if x in tfam.families[k]] for k in tfam.families}
        # a dict of {fid:[idx ...], ...}
        self.famsampidx = {}
        # a dict of {fid:[maf1, maf2 ...]}
        self.maf = OrderedDict()
        # reorder family samples based on order of VCF file
        # (iterate over a snapshot of the keys: entries are deleted inside the
        # loop, which raises RuntimeError on a live dict view in Python 3)
        for k in list(self.families):
            if len(self.families[k]) == 0:
                # skip families having no samples in VCF file
                del self.families[k]
            else:
                self.famsampidx[k] = [i for i, x in enumerate(samples_vcf) if x in self.families[k]]
        # a dict of {fid:[idx ...], ...}
        self.famvaridx = {}
        self.gss = {} #test line
        self.reset()

    def reset(self):
        '''Clear everything that belongs to the previous region: genotypes of
        every sample, variants, chromosome, per-family variant indices, MAFs
        and the super-marker count.'''
        for item in self.samples:
            self[item] = []
        self.variants = []
        self.chrom = None
        for k in self.families:
            self.famvaridx[k] = []
        self.maf = OrderedDict()
        # superMarkerCount is the max num. of recombinant fragments among all fams
        self.superMarkerCount = 0
        self.gss = {} #test line

    def getMidPosition(self):
        '''Return the mean of the variants' positions (item 1 of each variant),
        or None when the region has no variants.'''
        if len(self.variants) == 0:
            return None
        # NOTE(review): true division, so this is a float on Python 3 -- confirm
        # that callers which format/parse the value expect that
        return sum([x[1] for x in self.variants]) / len(self.variants)

    def getFamVariants(self, fam, style = None):
        '''Return the variants selected for family `fam`.

        style=None: the variant records themselves, in region order.
        style="map": a (names, positions, mafs) triple where names are
        "V<idx>-<pos>", following the order of self.famvaridx[fam].
        Raises ValueError for any other style.
        '''
        if style is None:
            wanted = set(self.famvaridx[fam])
            return [item for idx, item in enumerate(self.variants) if idx in wanted]
        elif style == "map":
            names = []
            pos = []
            mafs = []
            for idx in self.famvaridx[fam]:
                names.append("V{}-{}".format(idx, self.variants[idx][1]))
                pos.append(self.variants[idx][1])
                mafs.append(self.variants[idx][-1])
            return names, pos, mafs
        else:
            raise ValueError("Unknown style '{}'".format(style))

    def getFamSamples(self, fam):
        '''Return one ped-like row per member of `fam`, ordered by
        tfam.sort_family(fam): the member's tfam record without its last item,
        followed by its genotypes ("00" for every family variant when the
        member is not among the VCF samples).'''
        nvar = len([item for idx, item in enumerate(self.variants) if idx in self.famvaridx[fam]])
        output = [[]] * len(self.tfam.families[fam])
        for idx, item in enumerate(self.tfam.sort_family(fam)):
            # sample info, first 5 columns of ped
            # (slicing copies, so the tfam record itself is not extended below)
            output[idx] = self.tfam.samples[item][:-1]
            # sample genotypes
            if item in self.samples:
                output[idx].extend(self[item])
            else:
                output[idx].extend(["00"] * nvar)
        return output
{% endraw %}

Old maker

{% raw %}

class RegionExtractor[source]

RegionExtractor(filename, build='hg38', chr_prefix=None, allele_freq_info=None)

Extract given genomic region from VCF converting genotypes into dictionary of genotype list

{% endraw %} {% raw %}

class MarkerMaker[source]

MarkerMaker(wsize, maf_cutoff=None)

{% endraw %} {% raw %}

class LinkageWriter[source]

LinkageWriter(num_missing_append=0)

{% endraw %} {% raw %}

class EncoderWorker[source]

EncoderWorker(queue, length, data, extractor, coder, writer) :: Process

Process objects represent activity that is run in a separate process

The class is analogous to threading.Thread

{% endraw %} {% raw %}
{% endraw %}

Main function

{% raw %}
checkParams(args)
MESSAGE: Binary trait detected in [/mnt/mfs/statgen/yin/Github/linkage/SEQpy3/data/mwe_normal_fam.csv]
True
{% endraw %} {% raw %}
args.freq
'AF'
{% endraw %} {% raw %}
# pick the cache backend: NoCache when --no-save is set, otherwise a Cache
# built from the cache directory, the output name and all parsed arguments
cache = NoCache() if args.no_save else Cache(env.cache_dir, env.output, vars(args))
cache.setID('vcf')
{% endraw %} {% raw %}
# the archive branch is taken only when --vanilla is off and cache.check() is truthy
if args.vanilla or not cache.check():
    # load VCF file header
    data = RData(args.vcf, args.tfam)
    vs = data.vs
    samples_vcf = data.samples_vcf
else:
    env.log('Loading regional marker data from archive ...')
    cache.load(target_dir = env.tmp_dir, names = ['CACHE'])
    # the success counter is set to the summed fileLinesCount of the restored tped files
    tped_files = glob.glob('{}/*.tped'.format(env.tmp_cache))
    env.success_counter.value = sum(fileLinesCount(path) for path in tped_files)
    env.batch = 10

# stop when the VCF yielded no samples, otherwise log how many were found
n_vcf_samples = len(samples_vcf)
if n_vcf_samples == 0:
    env.error("Fail to extract samples from [{}]".format(args.vcf), exit = True)
env.log('{:,d} samples found in [{}]'.format(n_vcf_samples, args.vcf))
samples_not_vcf = data.samples_not_vcf
MESSAGE: 7 samples found in FAM file but not in VCF file:
1036_2, 22_1_10, 22_1_20, 28_9_186, 1036_1, 28_9_100, 28_9_101
MESSAGE: 3,461 samples in VCF file will be ignored due to absence in FAM file
MESSAGE: 3,479 samples found in [/mnt/mfs/statgen/yin/Github/linkage/SEQpy3/data/first1000snp_full_samples.vcf.gz]
{% endraw %} {% raw %}
len(samples_not_vcf),len(samples_vcf)
(7, 3479)
{% endraw %} {% raw %}
# both a family and a sample must remain, otherwise there is nothing to scan
n_families = len(data.families)
if n_families == 0:
    env.error('No valid family to process. ' \
              'Families have to be at least trio with at least one member in VCF file.', exit = True)
n_samples = len(data.samples)
if n_samples == 0:
    env.error('No valid sample to process. ' \
              'Samples have to be in families, and present in both TFAM and VCF files.', exit = True)
# rewrite the fam file listing the samples kept from the VCF first, then samples_not_vcf
tfam_path = os.path.join(env.tmp_cache, '{}.tfam'.format(env.output))
sample_order = list(data.samples.keys()) + samples_not_vcf
rewriteFamfile(tfam_path, data.tfam.samples, sample_order)
{% endraw %} {% raw %}
if args.single_markers:
    # one unit per distinct genome coordinate, kept in first-seen order
    regions=[]
    # membership is tested against a set: looking each record up in the growing
    # `regions` list made this loop quadratic in the number of variants
    seen_regions = set()
    for x in vs.GetGenomeCoordinates():
        region_info = (x[0], x[1], x[1], "{}:{}".format(x[0], x[1]), '.', '.', '.')
        if region_info not in seen_regions:
            seen_regions.add(region_info)
            regions.append(region_info)
    args.blueprint = None
else:
    # load blueprint: one whitespace-separated record per line
    try:
        with open(args.blueprint, 'r') as f:
            regions = [x.strip().split() for x in f.readlines()]
    except IOError:
        env.error("Cannot load regional marker blueprint [{}]. ".format(args.blueprint), exit = True)
env.log('{:,d} families with a total of {:,d} samples will be scanned for {:,d} pre-defined units'.\
        format(len(data.families), len(data.samples), len(regions)))
# never more jobs than units, and at least one
env.jobs = max(min(args.jobs, len(regions)), 1)
# one None sentinel per job is appended after the real units
regions.extend([None] * env.jobs)
queue = [] if env.jobs == 1 else Queue()
MESSAGE: 3 families with a total of 18 samples will be scanned for 28,325 pre-defined units
{% endraw %}

Testing

{% raw %}
# the three pipeline stages, in the order they are applied to a region
extractor = RegionExtractor(
    args.vcf,
    chr_prefix = args.chr_prefix,
    allele_freq_info = args.freq,
)
maker = MarkerMaker(args.bin, maf_cutoff = args.maf_cutoff)
# one extra missing genotype is appended per sample absent from the VCF
n_missing_samples = len(samples_not_vcf)
writer = LinkageWriter(n_missing_samples)
{% endraw %} {% raw %}
# Run the first 20 units through extractor -> maker -> writer in this process.
# non-zero stage status -> name of the env counter that tracks it
# (-1: previous module failed, 1: null, 2: trivial)
counter_for_status = {-1: 'chperror_counter', 1: 'null_counter', 2: 'trivial_counter'}
for j, region in enumerate(regions[:20]):
    # i counts how many stages were applied to this unit
    i = 0
    extractor.getRegion(region)
    maker.getRegion(region)
    writer.getRegion(region)
    for m in (extractor, maker, writer):
        status = m.apply(data)
        i += 1
        counter_name = counter_for_status.get(status)
        if counter_name is not None:
            counter = getattr(env, counter_name)
            with counter.get_lock():
                counter.value += 1
        if status != 0:
            # a stage did not succeed: skip the remaining stages
            isSuccess = False
            break
    else:
        # every stage returned 0
        isSuccess = True
        with env.success_counter.get_lock():
            env.success_counter.value += 1
    if j % 1000 == 0:
        print(j,i,len(data.variants),region)
in Haplotype
running family 1036
running family Estimating allele frequencies... [using maximum likelihood]0
   V8-13273 V12-13302 V17-13417 V22-13687 

V8-13273: 0 0.707106 0.292894 
total familyCount:1
V12-13302: 0 0.75 0.25 
total familyCount:1
V17-13417: 0 0.75 0.25 
total familyCount:1
V22-13687: 0 0.75 0.25 
total familyCount:1
22_1
running family 28_9
Estimating allele frequencies... [using maximum likelihood]0
   V8-13273 V15-13380 V17-13417 

V8-13273: 0 0.744757 0.255243 
total familyCount:1
V15-13380: 0 0.744757 0.255243 
total familyCount:1
V17-13417: 0 0.744757 0.255243 
total familyCount:1
0 3 25 ['1', '11868', '14362', 'LOC102725121@1', '9.177127474362311e-07', '1.1657192989882668e-06', '6.814189157634088e-07']
in Haplotype
running family 1036
running familyEstimating allele frequencies... [using maximum likelihood]0
   V8-13273 V12-13302 V17-13417 V22-13687 

V8-13273: 0 0.707106 0.292894 
total familyCount:1
 22_1
running familyV12-13302: 0 0.75 0.25 
total familyCount:1
V17-13417: 0 0.75 0.25 
total familyCount:1
V22-13687: 0 0.75 0.25 
total familyCount:1
Estimating allele frequencies... [using maximum likelihood]0
   V8-13273 V15-13380 V17-13417 

V8-13273: 0 0.744757 0.255243 
total familyCount:1
V15-13380: 0 0.744757 0.255243 
total familyCount:1
V17-13417: 0 0.744757 0.255243 
total familyCount:1
 28_9
in Haplotype
running family 1036
running family 22_1
running family 28_9
in Haplotype
running family 1036
Estimating allele frequencies... [using maximum likelihood]0
   V3-14464 V8-14653 V18-14907 V19-14930 V20-14933 V40-16103 V50-16378 
   V55-16487 V69-17147 V79-17358 V85-17385 V89-17407 V124-17697 V151-17928 
   V152-17929 V159-20184 V160-20191 V162-20212 V165-20227 V169-20235 
   V171-20250 V178-20316 V180-20485 V182-20522 V184-20547 V194-29368 

V3-14464: 0 0.75 0.25 
total familyCount:1
V8-14653: 0 0.75 0.25 
total familyCount:1
V18-14907: 0 0.5 0.5 
total familyCount:1
V19-14930: 0 0.5 0.5 
total familyCount:1
V20-14933: 0 1 
total familyCount:1
V40-16103: 0 0.707106 0.292894 
total familyCount:1
V50-16378: 0 0.5 0.5 
total familyCount:1
V55-16487: 0 0.75 0.25 
total familyCount:1
running family V69-17147: 0 0.5 0.5 
total familyCount:1
V79-17358: 0 1 
total familyCount:1
V85-17385: 0 1 
total familyCount:1
V89-17407: 0 0.5 0.5 
total familyCount:1
V124-17697: 0 0.5 0.5 
total familyCount:1
V151-17928: 0 0.5 0.5 
total familyCount:1
V152-17929: 0 0.5 0.5 
total familyCount:1
V159-20184: 0 0.5 0.5 
total familyCount:1
V160-20191: 0 1 
total familyCount:1
V162-20212: 0 1 
total familyCount:1
V165-20227: 0 1 
total familyCount:1
V169-20235: 0 1 
total familyCount:1
V171-20250: 0 1 
total familyCount:1
V178-20316: 0 1 
total familyCount:1
V180-20485: 0 1 
total familyCount:1
V182-20522: 0 1 
total familyCount:1
V184-20547: 0 0.75 0.25 
total familyCount:1
V194-29368: 0 1 
total familyCount:1
Estimating allele frequencies... [using maximum likelihood]0
   V3-14464 V8-14653 V10-14677 V18-14907 V19-14930 V40-16103 V50-16378 
   V55-16487 V69-17147 V80-17365 V85-17385 V86-17398 V89-17407 V95-17479 
   V108-17559 V112-17589 V124-17697 V129-17722 V131-17746 V142-17829 
   V155-19190 V1522_1
running family9-20184 V160-20191 V162-20212 V165-20227 V169-20235 
   V171-20250 V178-20316 V184-20547 

V3-14464: 0 0.739454 0.260546 
total familyCount:1
V8-14653: 0 0.707103 0.292897 
total familyCount:1
V10-14677: 0 0.739454 0.260546 
total familyCount:1
V18-14907: 0 0.707103 0.292897 
total familyCount:1
V19-14930: 0 0.728713 0.271287 
total familyCount:1
V40-16103: 0 0.5 0.5 
total familyCount:1
V50-16378: 0 0.5 0.5 
total familyCount:1
V55-16487: 0 0.744757 0.255243 
total familyCount:1
V69-17147: 0 0.744757 0.255243 
total familyCount:1
V80-17365: 0 0.744757 0.255243 
total familyCount:1
V85-17385: 0 0.739454 0.260546 
total familyCount:1
V86-17398: 0 0.744757 0.255243 
total familyCount:1
V89-17407: 0 0.744757 0.255243 
total familyCount:1
V95-17479: 0 0.744757 0.255243 
total familyCount:1
V108-17559: 0 0.744757 0.255243 
total familyCount:1
V112-17589: 0 0.744757 0.255243 
total familyCount:1
V124-17697: 0 0.739454 0.260546 
total familyCount:1
V129-17722: 0 0.744757 0.255243 
total famil 28_9
yCount:1
V131-17746: 0 0.739454 0.260546 
total familyCount:1
V142-17829: 0 0.744757 0.255243 
total familyCount:1
V155-19190: 0 0.739454 0.260546 
total familyCount:1
V159-20184: 0 0.744757 0.255243 
total familyCount:1
V160-20191: 0 0.744757 0.255243 
total familyCount:1
V162-20212: 0 0.744757 0.255243 
total familyCount:1
V165-20227: 0 0.744757 0.255243 
total familyCount:1
V169-20235: 0 0.744757 0.255243 
total familyCount:1
V171-20250: 0 0.739454 0.260546 
total familyCount:1
V178-20316: 0 0.739454 0.260546 
total familyCount:1
V184-20547: 0 0.739454 0.260546 
total familyCount:1
Estimating allele frequencies... [using maximum likelihood]0
   V3-14464 V5-14470 V8-14653 V10-14677 V18-14907 V19-14930 V34-16068 
   V40-16103 V50-16378 V84-17379 V85-17385 V90-17408 V105-17519 V107-17556 
   V116-17614 V155-19190 V158-20166 V160-20191 V162-20212 V165-20227 
   V166-20227 V168-20231 V169-20235 V171-20250 V174-20254 V178-20316 
   V183-20545 V184-20547 V194-29368 

V3-14464: 0 0.646634 0.353366 
total familyCount:1
V5-14470: 0 0.824706 0.175294 
total familyCount:1
V8-14653: 0 0.452863 0.547137 
total familyCount:1
V10-14677: 0 0.656432 0.343568 
total familyCount:1
V18-14907: 0 0.5 0.5 
total familyCount:1
V19-14930: 0 0.628665 0.371335 
total familyCount:1
V34-16068: 0 0.5 0.5 
total familyCount:1
V40-16103: 0 0.628667 0.371333 
total familyCount:1
V50-16378: 0 0.617751 0.382249 
total familyCount:1
V84-17379: 0 0.824706 0.175294 
total familyCount:1
V85-17385: 0 0.5 0.5 
total familyCount:1
V90-17408: 0 0.824706 0.175294 
total familyCount:1
V105-17519: 0 0.824706 0.175294 
total familyCount:1
V107-17556: 0 0.820231 0.179769 
total familyCount:1
V116-17614: 0 0.824706 0.175294 
total familyCount:1
V155-19190: 0 0.6 0.4 
total familyCount:1
V158-20166: 0 0.628665 0.371335 
total familyCount:1
V160-20191: 0 0.628665 0.371335 
total familyCount:1
V162-20212: 0 0.628665 0.371335 
total familyCount:1
V165-20227: 0 0.656432 0.343568 
total familyCount:1
V166-20227: 0 0.646634 0.353366 
total familyCount:1
V168-20231: 0 0.811255 0.188745 
total familyCount:1
V169-20235: 0 0.824706 0.175294 
total familyCount:1
V171-20250: 0 0.628665 0.371335 
total familyCount:1
V174-20254: 0 0.824706 0.175294 
total familyCount:1
V178-20316: 0 0.628665 0.371335 
total familyCount:1
V183-20545: 0 0.820231 0.179769 
total familyCount:1
V184-20547: 0 0.617751 0.382249 
total familyCount:1
V194-29368: 0 0.656432 0.343568 
total familyCount:1
Estimating allele frequencies... [using maximum likelihood]0          
   V4-17385 V8-17407 

V4-17385: 0 1 
total familyCount:1
V8-17407: 0 0.5 0.5 
total familyCount:1
Estimating allele frequencies... [using maximum likelihood]0
   V4-17385 V5-17398 V8-17407 

V4-17385: 0 0.739454 0.260546 
total familyCount:1
V5-17398: 0 0.744757 0.255243 
total familyCount:1
V8-17407: 0 0.744757 0.255243 
total familyCount:1
Estimating allele frequencies... [using maximum likelihood]0
   V3-17379 V4-17385 V9-17408 

V3-17379: 0 0.824706 0.175294 
total familyCount:1
V4-17385: 0 0.5 0.5 
total familyCount:1
V9-17408: 0 0.824706 0.175294 
total familyCount:1
{% endraw %} {% raw %}
env.dtest.keys()
dict_keys(['LOC102725121@1', 'DDX11L1', 'WASH7P', 'MIR6859-1@1,MIR6859-2@1,MIR6859-3@1,MIR6859-4@1', 'MIR1302-10@1,MIR1302-11@1,MIR1302-2@1,MIR1302-9@1', 'FAM138A@1,FAM138C@1,FAM138F@1', 'OR4F5', 'LOC729737', 'LOC100132062@1,LOC100132287@1', 'OR4F16@1,OR4F29@1,OR4F3@1', 'LOC101928626', 'MIR12136', 'OR4F16@2,OR4F29@2,OR4F3@2', 'LOC100133331', 'LOC100288069', 'FAM87B', 'LINC00115', 'LINC01128', 'FAM41C', 'LINC02593'])
{% endraw %} {% raw %}
env.dtest['LOC102725121@1']
OrderedDict([('dregions',
              [['1',
                '11868',
                '14362',
                'LOC102725121@1',
                '9.177127474362311e-07',
                '1.1657192989882668e-06',
                '6.814189157634088e-07']]),
             ('dvariants',
              [[['1', 13035, 'LOC102725121@1', 0.0002],
                ['1', 13053, 'LOC102725121@1', 0.0004],
                ['1', 13080, 'LOC102725121@1', 1e-05],
                ['1', 13115, 'LOC102725121@1', 1e-05],
                ['1', 13151, 'LOC102725121@1', 0.0001],
                ['1', 13169, 'LOC102725121@1', 0.0001],
                ['1', 13244, 'LOC102725121@1', 0.0002],
                ['1', 13248, 'LOC102725121@1', 0.0006],
                ['1', 13273, 'LOC102725121@1', 0.1548],
                ['1', 13289, 'LOC102725121@1', 0.0029],
                ['1', 13298, 'LOC102725121@1', 2.531e-05],
                ['1', 13302, 'LOC102725121@1', 0.0103],
                ['1', 13302, 'LOC102725121@1', 9.451e-05],
                ['1', 13303, 'LOC102725121@1', 0.0002],
                ['1', 13379, 'LOC102725121@1', 7.851e-05],
                ['1', 13380, 'LOC102725121@1', 0.0088],
                ['1', 13393, 'LOC102725121@1', 0.0002],
                ['1', 13417, 'LOC102725121@1', 0.1197],
                ['1', 13453, 'LOC102725121@1', 0.0003],
                ['1', 13494, 'LOC102725121@1', 0.0015],
                ['1', 13504, 'LOC102725121@1', 0.0026],
                ['1', 13687, 'LOC102725121@1', 0.0049],
                ['1', 13687, 'LOC102725121@1', 0.0001],
                ['1', 14159, 'LOC102725121@1', 0.0021],
                ['1', 14345, 'LOC102725121@1', 0.0001]]]),
             ('dfamvaridx',
              [{'1036': [8, 12, 17, 22], '22_1': [8, 15, 17], '28_9': []}]),
             ('dgeno',
              [{'28_9_103': [],
                '28_9_106': [],
                '28_9_108': [],
                '28_9_109': [],
                '28_9_111': [],
                '28_9_105': [],
                '28_9_114': [],
                '28_9_110': [],
                '1036_5': ['12', '12', '12', '11'],
                '1036_99': ['12', '11', '12', '11'],
                '1036_6': ['00', '00', '11', '12'],
                '1036_4': ['11', '12', '11', '11'],
                '1036_3': ['11', '11', '11', '11'],
                '22_1_2': ['11', '12', '11'],
                '22_1_3': ['11', '11', '11'],
                '22_1_4': ['11', '11', '11'],
                '22_1_5': ['11', '11', '11'],
                '22_1_99': ['12', '11', '12']}]),
             ('gss',
              [{0: {'1036': ('11', '11', '11', '11', '11'),
                 '22_1': ('11', '11', '11', '11', '11'),
                 '28_9': ('11', '11', '11', '11', '11', '11', '11', '11')},
                1: {'1036': ('11', '11', '11', '11', '11'),
                 '22_1': ('11', '11', '11', '11', '11'),
                 '28_9': ('11', '11', '11', '11', '11', '11', '11', '11')},
                2: {'1036': ('11', '11', '11', '11', '11'),
                 '22_1': ('11', '11', '11', '11', '11'),
                 '28_9': ('11', '11', '11', '11', '11', '11', '11', '11')},
                3: {'1036': ('11', '11', '11', '11', '11'),
                 '22_1': ('11', '11', '11', '11', '11'),
                 '28_9': ('11', '11', '11', '11', '11', '11', '11', '11')},
                4: {'1036': ('11', '11', '11', '11', '11'),
                 '22_1': ('11', '11', '11', '11', '11'),
                 '28_9': ('11', '11', '11', '11', '11', '11', '11', '11')},
                5: {'1036': ('11', '11', '11', '11', '11'),
                 '22_1': ('11', '11', '11', '11', '11'),
                 '28_9': ('11', '11', '11', '11', '11', '11', '11', '11')},
                6: {'1036': ('11', '11', '11', '11', '11'),
                 '22_1': ('11', '11', '11', '11', '11'),
                 '28_9': ('11', '11', '11', '11', '11', '11', '11', '11')},
                7: {'1036': ('11', '11', '11', '11', '11'),
                 '22_1': ('11', '11', '11', '11', '11'),
                 '28_9': ('11', '11', '11', '11', '11', '11', '11', '11')},
                8: {'1036': ('11', '11', '11', '11', '11'),
                 '22_1': ('11', '11', '11', '11', '11'),
                 '28_9': ('11', '11', '11', '11', '11', '11', '11', '11')},
                9: {'1036': ('11', '11', '11', '11', '11'),
                 '22_1': ('11', '11', '11', '11', '11'),
                 '28_9': ('11', '11', '11', '11', '11', '11', '11', '11')},
                10: {'1036': ('11', '11', '11', '11', '11'),
                 '22_1': ('11', '11', '11', '11', '11'),
                 '28_9': ('11', '11', '11', '11', '11', '11', '11', '11')},
                11: {'1036': ('11', '11', '11', '11', '11'),
                 '22_1': ('11', '11', '11', '11', '11'),
                 '28_9': ('11', '11', '11', '11', '11', '11', '11', '11')},
                12: {'1036': ('11', '11', '11', '11', '11'),
                 '22_1': ('11', '11', '11', '11', '11'),
                 '28_9': ('11', '11', '11', '11', '11', '11', '11', '11')},
                13: {'1036': ('11', '11', '11', '11', '11'),
                 '22_1': ('11', '11', '11', '11', '11'),
                 '28_9': ('11', '11', '11', '11', '11', '11', '11', '11')},
                14: {'1036': ('11', '11', '11', '11', '11'),
                 '22_1': ('11', '11', '11', '11', '11'),
                 '28_9': ('11', '11', '11', '11', '11', '11', '11', '11')},
                15: {'1036': ('11', '11', '11', '11', '11'),
                 '22_1': ('11', '11', '11', '11', '11'),
                 '28_9': ('11', '11', '11', '11', '11', '11', '11', '11')},
                16: {'1036': ('11', '11', '11', '11', '11'),
                 '22_1': ('11', '11', '11', '11', '11'),
                 '28_9': ('11', '11', '11', '11', '11', '11', '11', '11')},
                17: {'1036': ('11', '11', '11', '11', '11'),
                 '22_1': ('11', '11', '11', '11', '11'),
                 '28_9': ('11', '11', '11', '11', '11', '11', '11', '11')},
                18: {'1036': ('11', '11', '11', '11', '11'),
                 '22_1': ('11', '11', '11', '11', '11'),
                 '28_9': ('11', '11', '11', '11', '11', '11', '11', '11')},
                19: {'1036': ('11', '11', '11', '11', '11'),
                 '22_1': ('11', '11', '11', '11', '11'),
                 '28_9': ('11', '11', '11', '11', '11', '11', '11', '11')},
                20: {'1036': ('11', '11', '11', '11', '11'),
                 '22_1': ('11', '11', '11', '11', '11'),
                 '28_9': ('11', '11', '11', '11', '11', '11', '11', '11')},
                21: {'1036': ('11', '11', '11', '11', '11'),
                 '22_1': ('11', '11', '11', '11', '11'),
                 '28_9': ('11', '11', '11', '11', '11', '11', '11', '11')},
                22: {'1036': ('11', '11', '11', '11', '11'),
                 '22_1': ('11', '11', '11', '11', '11'),
                 '28_9': ('11', '11', '11', '11', '11', '11', '11', '11')},
                23: {'1036': ('11', '11', '11', '11', '11'),
                 '22_1': ('11', '11', '11', '11', '11'),
                 '28_9': ('11', '11', '11', '11', '11', '11', '11', '11')},
                24: {'1036': ('11', '11', '11', '11', '11'),
                 '22_1': ('11', '11', '11', '11', '11'),
                 '28_9': ('11', '11', '11', '11', '11', '11', '11', '11')}}]),
             ('hapimp',
              {'1036': ['1036',
                ['V8-13273', 'V12-13302', 'V17-13417', 'V22-13687'],
                [13273, 13302, 13417, 13687],
                [0.1548, 9.451e-05, 0.1197, 0.0001],
                (('1036', '1036_1', '1:', '1:', '1:', '1:'),
                 ('1036', '1036_1', '?:', '?:', '?:', '?:'),
                 ('1036', '1036_2', '2:', '1:', '2:', '1:'),
                 ('1036', '1036_2', '?:', '?:', '?:', '?:'),
                 ('1036', '1036_6', '1:', '2:', '1:', '1:'),
                 ('1036', '1036_6', '1:', '1:', '1:', '2:'),
                 ('1036', '1036_99', '2:', '1|', '2|', '1|'),
                 ('1036', '1036_99', '1:', '1|', '1|', '1|'),
                 ('1036', '1036_5', '2:', '1|', '2|', '1|'),
                 ('1036', '1036_5', '1:', '2|', '1|', '1|'),
                 ('1036', '1036_4', '1:', '1|', '1|', '1|'),
                 ('1036', '1036_4', '1:', '2|', '1|', '1|'),
                 ('1036', '1036_3', '1:', '1|', '1|', '1\\'),
                 ('1036', '1036_3', '1:', '1|', '1|', '1\\'))],
               '22_1': ['22_1',
                ['V8-13273', 'V15-13380', 'V17-13417'],
                [13273, 13380, 13417],
                [0.1548, 0.0088, 0.1197],
                (('22_1', '22_1_10', '2:', '1:', '2:'),
                 ('22_1', '22_1_10', '1:', '1:', '1:'),
                 ('22_1', '22_1_20', '1:', '1:', '1:'),
                 ('22_1', '22_1_20', '1:', '2:', '1:'),
                 ('22_1', '22_1_99', '1:', '1|', '1|'),
                 ('22_1', '22_1_99', '2:', '1|', '2|'),
                 ('22_1', '22_1_5', '1:', '1|', '1|'),
                 ('22_1', '22_1_5', '1:', '1|', '1|'),
                 ('22_1', '22_1_4', '1:', '1|', '1|'),
                 ('22_1', '22_1_4', '1:', '1|', '1|'),
                 ('22_1', '22_1_3', '1:', '1|', '1|'),
                 ('22_1', '22_1_3', '1:', '1|', '1|'),
                 ('22_1', '22_1_2', '1:', '2|', '1|'),
                 ('22_1', '22_1_2', '1:', '1|', '1|'))],
               '28_9': ['28_9',
                [],
                [],
                [],
                {'V8-13273': 0.1548,
                 'V12-13302': 9.451e-05,
                 'V17-13417': 0.1197,
                 'V22-13687': 0.0001,
                 'V15-13380': 0.0088}]}),
             ('ld', []),
             ('coder',
              {'input': [{'28_9_103': ('0', '0'),
                 '28_9_106': ('0', '0'),
                 '28_9_108': ('0', '0'),
                 '28_9_109': ('0', '0'),
                 '28_9_111': ('0', '0'),
                 '28_9_105': ('0', '0'),
                 '28_9_114': ('0', '0'),
                 '28_9_110': ('0', '0'),
                 '1036_5': ['12', '12', '12', '11'],
                 '1036_99': ['12', '11', '12', '11'],
                 '1036_6': ['00', '00', '11', '12'],
                 '1036_4': ['11', '12', '11', '11'],
                 '1036_3': ['11', '11', '11', '11'],
                 '22_1_2': ['11', '12', '11'],
                 '22_1_3': ['11', '11', '11'],
                 '22_1_4': ['11', '11', '11'],
                 '22_1_5': ['11', '11', '11'],
                 '22_1_99': ['12', '11', '12']},
                OrderedDict([('1036',
                              [['1036', '1036_1', '1:', '1:', '1:', '1:'],
                               ['1036', '1036_1', '?:', '?:', '?:', '?:'],
                               ['1036', '1036_2', '2:', '1:', '2:', '1:'],
                               ['1036', '1036_2', '?:', '?:', '?:', '?:'],
                               ['1036', '1036_6', '1:', '2:', '1:', '1:'],
                               ['1036', '1036_6', '1:', '1:', '1:', '2:'],
                               ['1036', '1036_99', '2:', '1|', '2|', '1|'],
                               ['1036', '1036_99', '1:', '1|', '1|', '1|'],
                               ['1036', '1036_5', '2:', '1|', '2|', '1|'],
                               ['1036', '1036_5', '1:', '2|', '1|', '1|'],
                               ['1036', '1036_4', '1:', '1|', '1|', '1|'],
                               ['1036', '1036_4', '1:', '2|', '1|', '1|'],
                               ['1036', '1036_3', '1:', '1|', '1|', '1\\'],
                               ['1036', '1036_3', '1:', '1|', '1|', '1\\']]),
                             ('22_1',
                              [['22_1', '22_1_10', '2:', '1:', '2:'],
                               ['22_1', '22_1_10', '1:', '1:', '1:'],
                               ['22_1', '22_1_20', '1:', '1:', '1:'],
                               ['22_1', '22_1_20', '1:', '2:', '1:'],
                               ['22_1', '22_1_99', '1:', '1|', '1|'],
                               ['22_1', '22_1_99', '2:', '1|', '2|'],
                               ['22_1', '22_1_5', '1:', '1|', '1|'],
                               ['22_1', '22_1_5', '1:', '1|', '1|'],
                               ['22_1', '22_1_4', '1:', '1|', '1|'],
                               ['22_1', '22_1_4', '1:', '1|', '1|'],
                               ['22_1', '22_1_3', '1:', '1|', '1|'],
                               ['22_1', '22_1_3', '1:', '1|', '1|'],
                               ['22_1', '22_1_2', '1:', '2|', '1|'],
                               ['22_1', '22_1_2', '1:', '1|', '1|']])]),
                {'V8-13273': 0.1548,
                 'V12-13302': 9.451e-05,
                 'V17-13417': 0.1197,
                 'V22-13687': 0.0001,
                 'V15-13380': 0.0088},
                {'1036': ['V8-13273', 'V12-13302', 'V17-13417', 'V22-13687'],
                 '22_1': ['V8-13273', 'V15-13380', 'V17-13417'],
                 '28_9': []},
                []],
               'output': [(('1036',
                  '1036_1',
                  '1,1',
                  '111,1',
                  '0,0',
                  'NULL,NULL'),
                 ('1036', '1036_2', '3,1', '212,1', '0,0', 'NULL,NULL'),
                 ('1036', '1036_6', '2,1', '121,1', '1,2', '111,2'),
                 ('1036', '1036_99', '3,1', '212,1', '1,1', '111,1'),
                 ('1036', '1036_5', '3,1', '212,1', '2,1', '121,1'),
                 ('1036', '1036_4', '1,1', '111,1', '2,1', '121,1'),
                 ('1036', '1036_3', '1,1', '111,1', '1,1', '111,1'),
                 ('22_1', '22_1_10', '3', '212', '1', '111'),
                 ('22_1', '22_1_20', '1', '111', '2', '121'),
                 ('22_1', '22_1_99', '1', '111', '3', '212'),
                 ('22_1', '22_1_5', '1', '111', '1', '111'),
                 ('22_1', '22_1_4', '1', '111', '1', '111'),
                 ('22_1', '22_1_3', '1', '111', '1', '111'),
                 ('22_1', '22_1_2', '2', '121', '1', '111')),
                {'28_9_103': ('0', '0'),
                 '28_9_106': ('0', '0'),
                 '28_9_108': ('0', '0'),
                 '28_9_109': ('0', '0'),
                 '28_9_111': ('0', '0'),
                 '28_9_105': ('0', '0'),
                 '28_9_114': ('0', '0'),
                 '28_9_110': ('0', '0'),
                 '1036_5': (['3', '1'], ['2', '1']),
                 '1036_99': (['3', '1'], ['1', '1']),
                 '1036_6': (['2', '1'], ['1', '2']),
                 '1036_4': (['1', '1'], ['2', '1']),
                 '1036_3': (['1', '1'], ['1', '1']),
                 '22_1_2': (['2'], ['1']),
                 '22_1_3': (['1'], ['1']),
                 '22_1_4': (['1'], ['1']),
                 '22_1_5': (['1'], ['1']),
                 '22_1_99': (['1'], ['3']),
                 '1036_1': (['1', '1'], ['0', '0']),
                 '1036_2': (['3', '1'], ['0', '0']),
                 '22_1_10': (['3'], ['1']),
                 '22_1_20': (['1'], ['2'])},
                2,
                OrderedDict([('1036',
                              ((0.9756108487483803,
                                9.22136964316592e-05,
                                0.02429693755518805),
                               (0.9999, 0.0001))),
                             ('22_1',
                              ((0.9673214995311192,
                                0.008588003627798477,
                                0.02409049684108229),))])]}),
             ('format',
              {'28_9_103': [('0', '0'), ('0', '0')],
               '28_9_106': [('0', '0'), ('0', '0')],
               '28_9_108': [('0', '0'), ('0', '0')],
               '28_9_109': [('0', '0'), ('0', '0')],
               '28_9_111': [('0', '0'), ('0', '0')],
               '28_9_105': [('0', '0'), ('0', '0')],
               '28_9_114': [('0', '0'), ('0', '0')],
               '28_9_110': [('0', '0'), ('0', '0')],
               '1036_5': [('3', '2'), ('1', '1')],
               '1036_99': [('3', '1'), ('1', '1')],
               '1036_6': [('2', '1'), ('1', '2')],
               '1036_4': [('1', '2'), ('1', '1')],
               '1036_3': [('1', '1'), ('1', '1')],
               '22_1_2': [('2', '1'), ('0', '0')],
               '22_1_3': [('1', '1'), ('0', '0')],
               '22_1_4': [('1', '1'), ('0', '0')],
               '22_1_5': [('1', '1'), ('0', '0')],
               '22_1_99': [('1', '3'), ('0', '0')],
               '1036_1': [('1', '0'), ('1', '0')],
               '1036_2': [('3', '0'), ('1', '0')],
               '22_1_10': [('3', '1'), ('0', '0')],
               '22_1_20': [('1', '2'), ('0', '0')]})])
{% endraw %} {% raw %}
# Keep only the MC1R and PAPPA2 entries of env.dtest, keyed by gene name.
a1 = {gene: env.dtest[gene] for gene in ('MC1R', 'PAPPA2')}
{% endraw %} {% raw %}
import pickle
# Serialize `a1` to 'dtestpy3_fixedcoder.pickle' in the current working
# directory, using the newest pickle protocol this interpreter supports.
# The file is opened in binary write mode, so an existing file is overwritten.
with open('dtestpy3_fixedcoder.pickle', 'wb') as handle:
    pickle.dump(a1, handle, protocol=pickle.HIGHEST_PROTOCOL)

# To read the snapshot back into `b`, uncomment the two lines below
# (only unpickle files you trust):
#with open('dtestpy3_fixedcoder.pickle', 'rb') as handle:
#    b = pickle.load(handle)
{% endraw %} {% raw %}
# Keep only the MC1R and PAPPA2 entries of env.dtest, keyed by gene name.
a = {gene: env.dtest[gene] for gene in ('MC1R', 'PAPPA2')}
{% endraw %} {% raw %}
import pickle
# Serialize `a` to 'dtestpy3.pickle' in the current working directory,
# using the newest pickle protocol this interpreter supports.
# The file is opened in binary write mode, so an existing file is overwritten.
with open('dtestpy3.pickle', 'wb') as handle:
    pickle.dump(a, handle, protocol=pickle.HIGHEST_PROTOCOL)

# To read the snapshot back into `b`, uncomment the two lines below
# (only unpickle files you trust):
#with open('dtestpy3.pickle', 'rb') as handle:
#    b = pickle.load(handle)
{% endraw %} {% raw %}
# Display the entry stored under 'MC1R' in `a`; its repr follows as the cell output.
a['MC1R']
OrderedDict([('dregions',
              [['16',
                '89984286',
                '89987385',
                'MC1R',
                '133.689089888',
                '159.050776809',
                '111.195245154']]),
             ('dvariants',
              [[['16', 89984370, 'MC1R', 0.00015],
                ['16', 89984604, 'MC1R', 0.00015],
                ['16', 89984739, 'MC1R', 0.00015],
                ['16', 89985940, 'MC1R', 0.085784],
                ['16', 89986608, 'MC1R', 0.107936],
                ['16', 89986760, 'MC1R', 0.00015],
                ['16', 89987201, 'MC1R', 0.00015]]]),
             ('dfamvaridx',
              [{'1': [0, 1, 2, 3, 4, 5, 6], '2': [0, 1, 2, 3, 4, 5, 6]}]),
             ('dgeno',
              [{'II:2': ['12', '12', '11', '11', '11', '12', '12'],
                'I:2': ['12', '12', '11', '11', '11', '12', '12'],
                'I:1': ['12', '12', '11', '11', '11', '11', '12'],
                'II:3': ['12', '12', '11', '11', '11', '12', '12'],
                'II:4': ['20', '12', '12', '12', '12', '12', '22'],
                'II:1': ['22', '22', '11', '11', '11', '12', '22'],
                'II:B': ['11', '11', '11', '11', '11', '11', '12'],
                'I:B': ['12', '12', '12', '12', '12', '12', '11'],
                'I:A': ['11', '11', '11', '11', '11', '11', '22'],
                'II:C': ['11', '11', '11', '11', '11', '11', '12'],
                'II:D': ['12', '12', '12', '12', '12', '12', '12'],
                'II:A': ['12', '12', '12', '12', '12', '12', '12']}]),
             ('gss',
              [{0: {'1': ('12', '12', '12', '12', '22', '22'),
                 '2': ('12', '11', '22', '12', '12', '12')},
                1: {'1': ('12', '12', '12', '12', '22', '22'),
                 '2': ('12', '11', '22', '12', '12', '12')},
                2: {'1': ('12', '12', '12', '12', '22', '22'),
                 '2': ('12', '11', '22', '12', '12', '12')},
                3: {'1': ('12', '12', '12', '12', '22', '22'),
                 '2': ('12', '11', '22', '12', '12', '12')},
                4: {'1': ('12', '12', '12', '12', '22', '22'),
                 '2': ('12', '11', '22', '12', '12', '12')},
                5: {'1': ('12', '12', '12', '12', '22', '22'),
                 '2': ('12', '11', '22', '12', '12', '12')},
                6: {'1': ('12', '12', '12', '12', '22', '22'),
                 '2': ('12', '11', '22', '12', '12', '12')}}]),
             ('hapimp',
              {'1': ['1',
                ['V0-89984370',
                 'V1-89984604',
                 'V2-89984739',
                 'V3-89985940',
                 'V4-89986608',
                 'V5-89986760',
                 'V6-89987201'],
                [89984370,
                 89984604,
                 89984739,
                 89985940,
                 89986608,
                 89986760,
                 89987201],
                [0.00015,
                 0.00015,
                 0.00015,
                 0.085784,
                 0.107936,
                 0.00015,
                 0.00015],
                (('1', 'I:1', '1:', '1:', '1:', '1:', '1:', '1:', '1:'),
                 ('1', 'I:1', '2:', '2:', '1:', '1:', '1:', '1:', '2:'),
                 ('1', 'I:2', '2:', '2:', '1:', '1:', '1:', '2:', '2:'),
                 ('1', 'I:2', '1:', '1:', '1:', '1:', '1:', '1:', '1:'),
                 ('1', 'II:3', '2:', '2|', '1:', '1:', '1:', '2|', '2|'),
                 ('1', 'II:3', '1:', '1|', '1:', '1:', '1:', '1|', '1|'),
                 ('1', 'II:2', '2:', '2|', '1:', '1:', '1:', '2|', '2|'),
                 ('1', 'II:2', '1:', '1|', '1:', '1:', '1:', '1|', '1|'),
                 ('1', 'II:1', '2:', '2|', '1:', '1:', '1:', '2|', '2|'),
                 ('1', 'II:1', '2:', '2|', '1:', '1:', '1:', '1|', '2|'),
                 ('1', 'II:4', '2:', '2|', '1:', '1:', '1:', '2\\', '2|'),
                 ('1', 'II:4', '1:', '1|', '1:', '1:', '1:', '1\\', '2|'))],
               '2': ['2',
                ['V0-89984370',
                 'V1-89984604',
                 'V2-89984739',
                 'V3-89985940',
                 'V4-89986608',
                 'V5-89986760',
                 'V6-89987201'],
                [89984370,
                 89984604,
                 89984739,
                 89985940,
                 89986608,
                 89986760,
                 89987201],
                [0.00015,
                 0.00015,
                 0.00015,
                 0.085784,
                 0.107936,
                 0.00015,
                 0.00015],
                (('2', 'I:A', '1:', '1:', '1:', '1:', '1:', '1:', '2:'),
                 ('2', 'I:A', '1:', '1:', '1:', '1:', '1:', '1:', '2:'),
                 ('2', 'I:B', '2:', '2:', '2:', '2:', '2:', '2:', '1:'),
                 ('2', 'I:B', '1:', '1:', '1:', '1:', '1:', '1:', '1:'),
                 ('2', 'II:D', '2:', '2|', '2|', '2|', '2|', '2|', '1:'),
                 ('2', 'II:D', '1:', '1|', '1|', '1|', '1|', '1|', '2:'),
                 ('2', 'II:C', '1:', '1|', '1|', '1|', '1|', '1|', '1:'),
                 ('2', 'II:C', '1:', '1|', '1|', '1|', '1|', '1|', '2:'),
                 ('2', 'II:B', '1:', '1|', '1|', '1|', '1|', '1|', '1:'),
                 ('2', 'II:B', '1:', '1|', '1|', '1|', '1|', '1|', '2:'),
                 ('2', 'II:A', '2:', '2|', '2|', '2|', '2|', '2|', '1:'),
                 ('2', 'II:A', '1:', '1|', '1|', '1|', '1|', '1|', '2:')),
                {'V0-89984370': 0.00015,
                 'V1-89984604': 0.00015,
                 'V2-89984739': 0.00015,
                 'V3-89985940': 0.085784,
                 'V4-89986608': 0.107936,
                 'V5-89986760': 0.00015,
                 'V6-89987201': 0.00015}]}),
             ('ld',
              [[[None],
                [1.0, None],
                [0.2380952380952381, 0.2380952380952381, None],
                [0.2380952380952381, 0.2380952380952381, 1.0, None],
                [0.2380952380952381, 0.2380952380952381, 1.0, 1.0, None],
                [0.5555555555555556,
                 0.5555555555555556,
                 0.42857142857142866,
                 0.42857142857142866,
                 0.42857142857142866,
                 None],
                [0.06666666666666665,
                 0.06666666666666665,
                 0.14285714285714282,
                 0.14285714285714282,
                 0.14285714285714282,
                 0.0,
                 None]],
               [[0, 1], [2, 3, 4], [3, 4]],
               [['V0-89984370', 'V1-89984604'],
                ['V2-89984739', 'V3-89985940', 'V4-89986608']]]),
             ('coder',
              {'input': [{'II:2': ['12', '12', '11', '11', '11', '12', '12'],
                 'I:2': ['12', '12', '11', '11', '11', '12', '12'],
                 'I:1': ['12', '12', '11', '11', '11', '11', '12'],
                 'II:3': ['12', '12', '11', '11', '11', '12', '12'],
                 'II:4': ['20', '12', '12', '12', '12', '12', '22'],
                 'II:1': ['22', '22', '11', '11', '11', '12', '22'],
                 'II:B': ['11', '11', '11', '11', '11', '11', '12'],
                 'I:B': ['12', '12', '12', '12', '12', '12', '11'],
                 'I:A': ['11', '11', '11', '11', '11', '11', '22'],
                 'II:C': ['11', '11', '11', '11', '11', '11', '12'],
                 'II:D': ['12', '12', '12', '12', '12', '12', '12'],
                 'II:A': ['12', '12', '12', '12', '12', '12', '12']},
                OrderedDict([('1',
                              [['1',
                                'I:1',
                                '1:',
                                '1:',
                                '1:',
                                '1:',
                                '1:',
                                '1:',
                                '1:'],
                               ['1',
                                'I:1',
                                '2:',
                                '2:',
                                '1:',
                                '1:',
                                '1:',
                                '1:',
                                '2:'],
                               ['1',
                                'I:2',
                                '2:',
                                '2:',
                                '1:',
                                '1:',
                                '1:',
                                '2:',
                                '2:'],
                               ['1',
                                'I:2',
                                '1:',
                                '1:',
                                '1:',
                                '1:',
                                '1:',
                                '1:',
                                '1:'],
                               ['1',
                                'II:3',
                                '2:',
                                '2|',
                                '1:',
                                '1:',
                                '1:',
                                '2|',
                                '2|'],
                               ['1',
                                'II:3',
                                '1:',
                                '1|',
                                '1:',
                                '1:',
                                '1:',
                                '1|',
                                '1|'],
                               ['1',
                                'II:2',
                                '2:',
                                '2|',
                                '1:',
                                '1:',
                                '1:',
                                '2|',
                                '2|'],
                               ['1',
                                'II:2',
                                '1:',
                                '1|',
                                '1:',
                                '1:',
                                '1:',
                                '1|',
                                '1|'],
                               ['1',
                                'II:1',
                                '2:',
                                '2|',
                                '1:',
                                '1:',
                                '1:',
                                '2|',
                                '2|'],
                               ['1',
                                'II:1',
                                '2:',
                                '2|',
                                '1:',
                                '1:',
                                '1:',
                                '1|',
                                '2|'],
                               ['1',
                                'II:4',
                                '2:',
                                '2|',
                                '1:',
                                '1:',
                                '1:',
                                '2\\',
                                '2|'],
                               ['1',
                                'II:4',
                                '1:',
                                '1|',
                                '1:',
                                '1:',
                                '1:',
                                '1\\',
                                '2|']]),
                             ('2',
                              [['2',
                                'I:A',
                                '1:',
                                '1:',
                                '1:',
                                '1:',
                                '1:',
                                '1:',
                                '2:'],
                               ['2',
                                'I:A',
                                '1:',
                                '1:',
                                '1:',
                                '1:',
                                '1:',
                                '1:',
                                '2:'],
                               ['2',
                                'I:B',
                                '2:',
                                '2:',
                                '2:',
                                '2:',
                                '2:',
                                '2:',
                                '1:'],
                               ['2',
                                'I:B',
                                '1:',
                                '1:',
                                '1:',
                                '1:',
                                '1:',
                                '1:',
                                '1:'],
                               ['2',
                                'II:D',
                                '2:',
                                '2|',
                                '2|',
                                '2|',
                                '2|',
                                '2|',
                                '1:'],
                               ['2',
                                'II:D',
                                '1:',
                                '1|',
                                '1|',
                                '1|',
                                '1|',
                                '1|',
                                '2:'],
                               ['2',
                                'II:C',
                                '1:',
                                '1|',
                                '1|',
                                '1|',
                                '1|',
                                '1|',
                                '1:'],
                               ['2',
                                'II:C',
                                '1:',
                                '1|',
                                '1|',
                                '1|',
                                '1|',
                                '1|',
                                '2:'],
                               ['2',
                                'II:B',
                                '1:',
                                '1|',
                                '1|',
                                '1|',
                                '1|',
                                '1|',
                                '1:'],
                               ['2',
                                'II:B',
                                '1:',
                                '1|',
                                '1|',
                                '1|',
                                '1|',
                                '1|',
                                '2:'],
                               ['2',
                                'II:A',
                                '2:',
                                '2|',
                                '2|',
                                '2|',
                                '2|',
                                '2|',
                                '1:'],
                               ['2',
                                'II:A',
                                '1:',
                                '1|',
                                '1|',
                                '1|',
                                '1|',
                                '1|',
                                '2:']])]),
                {'V0-89984370': 0.00015,
                 'V1-89984604': 0.00015,
                 'V2-89984739': 0.00015,
                 'V3-89985940': 0.085784,
                 'V4-89986608': 0.107936,
                 'V5-89986760': 0.00015,
                 'V6-89987201': 0.00015},
                {'1': ['V0-89984370',
                  'V1-89984604',
                  'V2-89984739',
                  'V3-89985940',
                  'V4-89986608',
                  'V5-89986760',
                  'V6-89987201'],
                 '2': ['V0-89984370',
                  'V1-89984604',
                  'V2-89984739',
                  'V3-89985940',
                  'V4-89986608',
                  'V5-89986760',
                  'V6-89987201']},
                [['V0-89984370', 'V1-89984604'],
                 ['V2-89984739', 'V3-89985940', 'V4-89986608']]],
               'output': [(('1', 'I:1', '1,1', '1111,11', '2,2', '2222,12'),
                 ('1', 'I:2', '2,3', '2222,22', '1,1', '1111,11'),
                 ('1', 'II:3', '2,3', '2222,22', '1,1', '1111,11'),
                 ('1', 'II:2', '2,3', '2222,22', '1,1', '1111,11'),
                 ('1', 'II:1', '2,3', '2222,22', '2,2', '2222,12'),
                 ('1', 'II:4', '2,3', '2222,22', '1,2', '1111,12'),
                 ('2', 'I:A', '2', '111112', '2', '111112'),
                 ('2', 'I:B', '3', '222221', '1', '111111'),
                 ('2', 'II:D', '3', '222221', '2', '111112'),
                 ('2', 'II:C', '1', '111111', '2', '111112'),
                 ('2', 'II:B', '1', '111111', '2', '111112'),
                 ('2', 'II:A', '3', '222221', '2', '111112')),
                {'II:2': (['2', '3'], ['2222', '22']),
                 'I:2': (['2', '3'], ['2222', '22']),
                 'I:1': (['1', '1'], ['1111', '11']),
                 'II:3': (['2', '3'], ['2222', '22']),
                 'II:4': (['2', '3'], ['2222', '22']),
                 'II:1': (['2', '3'], ['2222', '22']),
                 'II:B': (['1'], ['111111']),
                 'I:B': (['3'], ['222221']),
                 'I:A': (['2'], ['111112']),
                 'II:C': (['1'], ['111111']),
                 'II:D': (['3'], ['222221']),
                 'II:A': (['3'], ['222221'])},
                2,
                OrderedDict([('1',
                              ((0.999999999999997, 3.0400071909574507e-15),
                               (0.9995498650574084,
                                0.000449932412250011,
                                2.0253034157679256e-07))),
                             ('2',
                              ((0.999550067496625,
                                0.0004499325033749946,
                                1.3677980457054751e-18),))])]}),
             ('format',
              {'II:2': [('2', '2222'), ('3', '22')],
               'I:2': [('2', '2222'), ('3', '22')],
               'I:1': [('1', '1111'), ('1', '11')],
               'II:3': [('2', '2222'), ('3', '22')],
               'II:4': [('2', '2222'), ('3', '22')],
               'II:1': [('2', '2222'), ('3', '22')],
               'II:B': [('1', '111111'), ('0', '0')],
               'I:B': [('3', '222221'), ('0', '0')],
               'I:A': [('2', '111112'), ('0', '0')],
               'II:C': [('1', '111111'), ('0', '0')],
               'II:D': [('3', '222221'), ('0', '0')],
               'II:A': [('3', '222221'), ('0', '0')]})])
{% endraw %} {% raw %}
a['PAPPA2']
OrderedDict([('dregions',
              [['1',
                '176432306',
                '176811970',
                'PAPPA2',
                '186.278964324',
                '238.991541401',
                '136.402021932']]),
             ('dvariants',
              [[['1', 176659933, 'PAPPA2', 0.00015],
                ['1', 176660247, 'PAPPA2', 0.00015],
                ['1', 176660371, 'PAPPA2', 0.00015],
                ['1', 176664842, 'PAPPA2', 0.241943],
                ['1', 176668823, 'PAPPA2', 0.00015],
                ['1', 176671914, 'PAPPA2', 0.00015],
                ['1', 176679384, 'PAPPA2', 0.00015],
                ['1', 176734756, 'PAPPA2', 0.352899],
                ['1', 176811754, 'PAPPA2', 0.00015],
                ['1', 176811873, 'PAPPA2', 0.00015]]]),
             ('dfamvaridx',
              [{'1': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
                '2': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]}]),
             ('dgeno',
              [{'II:2': ['12',
                 '11',
                 '12',
                 '11',
                 '12',
                 '11',
                 '11',
                 '12',
                 '11',
                 '12'],
                'I:2': ['11',
                 '11',
                 '11',
                 '11',
                 '11',
                 '11',
                 '11',
                 '12',
                 '12',
                 '11'],
                'I:1': ['12',
                 '12',
                 '22',
                 '12',
                 '22',
                 '12',
                 '12',
                 '22',
                 '12',
                 '12'],
                'II:3': ['12',
                 '11',
                 '12',
                 '11',
                 '12',
                 '11',
                 '11',
                 '12',
                 '11',
                 '12'],
                'II:4': ['11',
                 '12',
                 '12',
                 '12',
                 '12',
                 '12',
                 '12',
                 '22',
                 '22',
                 '11'],
                'II:1': ['12',
                 '11',
                 '12',
                 '11',
                 '12',
                 '11',
                 '11',
                 '12',
                 '11',
                 '12'],
                'II:B': ['12',
                 '11',
                 '11',
                 '12',
                 '12',
                 '12',
                 '12',
                 '11',
                 '11',
                 '11'],
                'I:B': ['12',
                 '11',
                 '12',
                 '12',
                 '22',
                 '22',
                 '12',
                 '12',
                 '12',
                 '12'],
                'I:A': ['11',
                 '12',
                 '11',
                 '11',
                 '11',
                 '11',
                 '11',
                 '11',
                 '11',
                 '11'],
                'II:C': ['12',
                 '11',
                 '11',
                 '12',
                 '12',
                 '12',
                 '12',
                 '11',
                 '11',
                 '11'],
                'II:D': ['11',
                 '11',
                 '12',
                 '11',
                 '12',
                 '12',
                 '11',
                 '12',
                 '12',
                 '12'],
                'II:A': ['11',
                 '12',
                 '12',
                 '11',
                 '12',
                 '12',
                 '11',
                 '12',
                 '12',
                 '12']}]),
             ('gss',
              [{0: {'1': ('12', '11', '12', '12', '11', '12'),
                 '2': ('11', '12', '11', '11', '12', '12')},
                1: {'1': ('12', '11', '12', '12', '11', '12'),
                 '2': ('11', '12', '11', '11', '12', '12')},
                2: {'1': ('12', '11', '12', '12', '11', '12'),
                 '2': ('11', '12', '11', '11', '12', '12')},
                3: {'1': ('12', '11', '12', '12', '11', '12'),
                 '2': ('11', '12', '11', '11', '12', '12')},
                4: {'1': ('12', '11', '12', '12', '11', '12'),
                 '2': ('11', '12', '11', '11', '12', '12')},
                5: {'1': ('12', '11', '12', '12', '11', '12'),
                 '2': ('11', '12', '11', '11', '12', '12')},
                6: {'1': ('12', '11', '12', '12', '11', '12'),
                 '2': ('11', '12', '11', '11', '12', '12')},
                7: {'1': ('12', '11', '12', '12', '11', '12'),
                 '2': ('11', '12', '11', '11', '12', '12')},
                8: {'1': ('12', '11', '12', '12', '11', '12'),
                 '2': ('11', '12', '11', '11', '12', '12')},
                9: {'1': ('12', '11', '12', '12', '11', '12'),
                 '2': ('11', '12', '11', '11', '12', '12')}}]),
             ('hapimp',
              {'1': ['1',
                ['V0-176659933',
                 'V1-176660247',
                 'V2-176660371',
                 'V3-176664842',
                 'V4-176668823',
                 'V5-176671914',
                 'V6-176679384',
                 'V7-176734756',
                 'V8-176811754',
                 'V9-176811873'],
                [176659933,
                 176660247,
                 176660371,
                 176664842,
                 176668823,
                 176671914,
                 176679384,
                 176734756,
                 176811754,
                 176811873],
                [0.00015,
                 0.00015,
                 0.00015,
                 0.241943,
                 0.00015,
                 0.00015,
                 0.00015,
                 0.352899,
                 0.00015,
                 0.00015],
                (('1',
                  'I:1',
                  '1:',
                  '2:',
                  '2:',
                  '2:',
                  '2:',
                  '2:',
                  '2:',
                  '2:',
                  '2:',
                  '1:'),
                 ('1',
                  'I:1',
                  '2:',
                  '1:',
                  '2:',
                  '1:',
                  '2:',
                  '1:',
                  '1:',
                  '2:',
                  '1:',
                  '2:'),
                 ('1',
                  'I:2',
                  '1:',
                  '1:',
                  '1:',
                  '1:',
                  '1:',
                  '1:',
                  '1:',
                  '2:',
                  '2:',
                  '1:'),
                 ('1',
                  'I:2',
                  '1:',
                  '1:',
                  '1:',
                  '1:',
                  '1:',
                  '1:',
                  '1:',
                  '1:',
                  '1:',
                  '1:'),
                 ('1',
                  'II:4',
                  '1:',
                  '1|',
                  '1:',
                  '1|',
                  '1:',
                  '1|',
                  '1|',
                  '2|',
                  '2|',
                  '1|'),
                 ('1',
                  'II:4',
                  '1:',
                  '2|',
                  '2:',
                  '2|',
                  '2:',
                  '2|',
                  '2|',
                  '2|',
                  '2|',
                  '1|'),
                 ('1',
                  'II:3',
                  '1:',
                  '1|',
                  '1:',
                  '1|',
                  '1:',
                  '1|',
                  '1|',
                  '1|',
                  '1|',
                  '1|'),
                 ('1',
                  'II:3',
                  '2:',
                  '1|',
                  '2:',
                  '1|',
                  '2:',
                  '1|',
                  '1|',
                  '2|',
                  '1|',
                  '2|'),
                 ('1',
                  'II:2',
                  '1:',
                  '1|',
                  '1:',
                  '1|',
                  '1:',
                  '1|',
                  '1|',
                  '1|',
                  '1|',
                  '1|'),
                 ('1',
                  'II:2',
                  '2:',
                  '1|',
                  '2:',
                  '1|',
                  '2:',
                  '1|',
                  '1|',
                  '2|',
                  '1|',
                  '2|'),
                 ('1',
                  'II:1',
                  '1:',
                  '1|',
                  '1:',
                  '1|',
                  '1:',
                  '1|',
                  '1|',
                  '1|',
                  '1|',
                  '1|'),
                 ('1',
                  'II:1',
                  '2:',
                  '1|',
                  '2:',
                  '1|',
                  '2:',
                  '1|',
                  '1|',
                  '2|',
                  '1|',
                  '2|'))],
               '2': ['2',
                ['V0-176659933',
                 'V1-176660247',
                 'V2-176660371',
                 'V3-176664842',
                 'V4-176668823',
                 'V5-176671914',
                 'V6-176679384',
                 'V7-176734756',
                 'V8-176811754',
                 'V9-176811873'],
                [176659933,
                 176660247,
                 176660371,
                 176664842,
                 176668823,
                 176671914,
                 176679384,
                 176734756,
                 176811754,
                 176811873],
                [0.00015,
                 0.00015,
                 0.00015,
                 0.241943,
                 0.00015,
                 0.00015,
                 0.00015,
                 0.352899,
                 0.00015,
                 0.00015],
                (('2',
                  'I:A',
                  '1:',
                  '1:',
                  '1:',
                  '1:',
                  '1:',
                  '1:',
                  '1:',
                  '1:',
                  '1:',
                  '1:'),
                 ('2',
                  'I:A',
                  '1:',
                  '2:',
                  '1:',
                  '1:',
                  '1:',
                  '1:',
                  '1:',
                  '1:',
                  '1:',
                  '1:'),
                 ('2',
                  'I:B',
                  '1:',
                  '1:',
                  '2:',
                  '1:',
                  '2:',
                  '2:',
                  '1:',
                  '2:',
                  '2:',
                  '2:'),
                 ('2',
                  'I:B',
                  '2:',
                  '1:',
                  '1:',
                  '2:',
                  '2:',
                  '2:',
                  '2:',
                  '1:',
                  '1:',
                  '1:'),
                 ('2',
                  'II:D',
                  '1:',
                  '1|',
                  '2|',
                  '1|',
                  '2:',
                  '2:',
                  '1|',
                  '2|',
                  '2|',
                  '2|'),
                 ('2',
                  'II:D',
                  '1:',
                  '1|',
                  '1|',
                  '1|',
                  '1:',
                  '1:',
                  '1|',
                  '1|',
                  '1|',
                  '1|'),
                 ('2',
                  'II:C',
                  '2:',
                  '1|',
                  '1|',
                  '2|',
                  '2:',
                  '2:',
                  '2|',
                  '1|',
                  '1|',
                  '1|'),
                 ('2',
                  'II:C',
                  '1:',
                  '1|',
                  '1|',
                  '1|',
                  '1:',
                  '1:',
                  '1|',
                  '1|',
                  '1|',
                  '1|'),
                 ('2',
                  'II:B',
                  '2:',
                  '1|',
                  '1|',
                  '2|',
                  '2:',
                  '2:',
                  '2|',
                  '1|',
                  '1|',
                  '1|'),
                 ('2',
                  'II:B',
                  '1:',
                  '1|',
                  '1|',
                  '1|',
                  '1:',
                  '1:',
                  '1|',
                  '1|',
                  '1|',
                  '1|'),
                 ('2',
                  'II:A',
                  '1:',
                  '1|',
                  '2|',
                  '1|',
                  '2:',
                  '2:',
                  '1|',
                  '2|',
                  '2|',
                  '2|'),
                 ('2',
                  'II:A',
                  '1:',
                  '2|',
                  '1|',
                  '1|',
                  '1:',
                  '1:',
                  '1|',
                  '1|',
                  '1|',
                  '1|')),
                {'V0-176659933': 0.00015,
                 'V1-176660247': 0.00015,
                 'V2-176660371': 0.00015,
                 'V3-176664842': 0.241943,
                 'V4-176668823': 0.00015,
                 'V5-176671914': 0.00015,
                 'V6-176679384': 0.00015,
                 'V7-176734756': 0.352899,
                 'V8-176811754': 0.00015,
                 'V9-176811873': 0.00015}]}),
             ('ld',
              [[[None],
                [0.1111111111111111, None],
                [0.022222222222222223, 0.022222222222222223, None],
                [0.1111111111111111,
                 0.1111111111111111,
                 0.022222222222222223,
                 None],
                [0.3333333333333334,
                 0.0,
                 0.6000000000000001,
                 0.3333333333333334,
                 None],
                [0.022222222222222223,
                 0.022222222222222223,
                 0.2177777777777778,
                 0.5555555555555556,
                 0.6000000000000001,
                 None],
                [0.1111111111111111,
                 0.1111111111111111,
                 0.022222222222222223,
                 1.0,
                 0.3333333333333334,
                 0.5555555555555556,
                 None],
                [0.0,
                 0.0,
                 0.6000000000000001,
                 0.0,
                 0.25,
                 0.06666666666666665,
                 0.0,
                 None],
                [0.19999999999999998,
                 0.022222222222222223,
                 0.2177777777777778,
                 0.022222222222222223,
                 0.06666666666666665,
                 0.2177777777777778,
                 0.022222222222222223,
                 0.6000000000000001,
                 None],
                [0.1111111111111111,
                 0.1111111111111111,
                 0.5555555555555556,
                 0.1111111111111111,
                 0.3333333333333334,
                 0.022222222222222223,
                 0.1111111111111111,
                 0.3333333333333334,
                 0.022222222222222223,
                 None]],
               [[3, 6]],
               [['V3-176664842', 'V6-176679384']]]),
             ('coder',
              {'input': [{'II:2': ['12',
                  '11',
                  '12',
                  '11',
                  '12',
                  '11',
                  '11',
                  '12',
                  '11',
                  '12'],
                 'I:2': ['11',
                  '11',
                  '11',
                  '11',
                  '11',
                  '11',
                  '11',
                  '12',
                  '12',
                  '11'],
                 'I:1': ['12',
                  '12',
                  '22',
                  '12',
                  '22',
                  '12',
                  '12',
                  '22',
                  '12',
                  '12'],
                 'II:3': ['12',
                  '11',
                  '12',
                  '11',
                  '12',
                  '11',
                  '11',
                  '12',
                  '11',
                  '12'],
                 'II:4': ['11',
                  '12',
                  '12',
                  '12',
                  '12',
                  '12',
                  '12',
                  '22',
                  '22',
                  '11'],
                 'II:1': ['12',
                  '11',
                  '12',
                  '11',
                  '12',
                  '11',
                  '11',
                  '12',
                  '11',
                  '12'],
                 'II:B': ['12',
                  '11',
                  '11',
                  '12',
                  '12',
                  '12',
                  '12',
                  '11',
                  '11',
                  '11'],
                 'I:B': ['12',
                  '11',
                  '12',
                  '12',
                  '22',
                  '22',
                  '12',
                  '12',
                  '12',
                  '12'],
                 'I:A': ['11',
                  '12',
                  '11',
                  '11',
                  '11',
                  '11',
                  '11',
                  '11',
                  '11',
                  '11'],
                 'II:C': ['12',
                  '11',
                  '11',
                  '12',
                  '12',
                  '12',
                  '12',
                  '11',
                  '11',
                  '11'],
                 'II:D': ['11',
                  '11',
                  '12',
                  '11',
                  '12',
                  '12',
                  '11',
                  '12',
                  '12',
                  '12'],
                 'II:A': ['11',
                  '12',
                  '12',
                  '11',
                  '12',
                  '12',
                  '11',
                  '12',
                  '12',
                  '12']},
                OrderedDict([('1',
                              [['1',
                                'I:1',
                                '1:',
                                '2:',
                                '2:',
                                '2:',
                                '2:',
                                '2:',
                                '2:',
                                '2:',
                                '2:',
                                '1:'],
                               ['1',
                                'I:1',
                                '2:',
                                '1:',
                                '2:',
                                '1:',
                                '2:',
                                '1:',
                                '1:',
                                '2:',
                                '1:',
                                '2:'],
                               ['1',
                                'I:2',
                                '1:',
                                '1:',
                                '1:',
                                '1:',
                                '1:',
                                '1:',
                                '1:',
                                '2:',
                                '2:',
                                '1:'],
                               ['1',
                                'I:2',
                                '1:',
                                '1:',
                                '1:',
                                '1:',
                                '1:',
                                '1:',
                                '1:',
                                '1:',
                                '1:',
                                '1:'],
                               ['1',
                                'II:4',
                                '1:',
                                '1|',
                                '1:',
                                '1|',
                                '1:',
                                '1|',
                                '1|',
                                '2|',
                                '2|',
                                '1|'],
                               ['1',
                                'II:4',
                                '1:',
                                '2|',
                                '2:',
                                '2|',
                                '2:',
                                '2|',
                                '2|',
                                '2|',
                                '2|',
                                '1|'],
                               ['1',
                                'II:3',
                                '1:',
                                '1|',
                                '1:',
                                '1|',
                                '1:',
                                '1|',
                                '1|',
                                '1|',
                                '1|',
                                '1|'],
                               ['1',
                                'II:3',
                                '2:',
                                '1|',
                                '2:',
                                '1|',
                                '2:',
                                '1|',
                                '1|',
                                '2|',
                                '1|',
                                '2|'],
                               ['1',
                                'II:2',
                                '1:',
                                '1|',
                                '1:',
                                '1|',
                                '1:',
                                '1|',
                                '1|',
                                '1|',
                                '1|',
                                '1|'],
                               ['1',
                                'II:2',
                                '2:',
                                '1|',
                                '2:',
                                '1|',
                                '2:',
                                '1|',
                                '1|',
                                '2|',
                                '1|',
                                '2|'],
                               ['1',
                                'II:1',
                                '1:',
                                '1|',
                                '1:',
                                '1|',
                                '1:',
                                '1|',
                                '1|',
                                '1|',
                                '1|',
                                '1|'],
                               ['1',
                                'II:1',
                                '2:',
                                '1|',
                                '2:',
                                '1|',
                                '2:',
                                '1|',
                                '1|',
                                '2|',
                                '1|',
                                '2|']]),
                             ('2',
                              [['2',
                                'I:A',
                                '1:',
                                '1:',
                                '1:',
                                '1:',
                                '1:',
                                '1:',
                                '1:',
                                '1:',
                                '1:',
                                '1:'],
                               ['2',
                                'I:A',
                                '1:',
                                '2:',
                                '1:',
                                '1:',
                                '1:',
                                '1:',
                                '1:',
                                '1:',
                                '1:',
                                '1:'],
                               ['2',
                                'I:B',
                                '1:',
                                '1:',
                                '2:',
                                '1:',
                                '2:',
                                '2:',
                                '1:',
                                '2:',
                                '2:',
                                '2:'],
                               ['2',
                                'I:B',
                                '2:',
                                '1:',
                                '1:',
                                '2:',
                                '2:',
                                '2:',
                                '2:',
                                '1:',
                                '1:',
                                '1:'],
                               ['2',
                                'II:D',
                                '1:',
                                '1|',
                                '2|',
                                '1|',
                                '2:',
                                '2:',
                                '1|',
                                '2|',
                                '2|',
                                '2|'],
                               ['2',
                                'II:D',
                                '1:',
                                '1|',
                                '1|',
                                '1|',
                                '1:',
                                '1:',
                                '1|',
                                '1|',
                                '1|',
                                '1|'],
                               ['2',
                                'II:C',
                                '2:',
                                '1|',
                                '1|',
                                '2|',
                                '2:',
                                '2:',
                                '2|',
                                '1|',
                                '1|',
                                '1|'],
                               ['2',
                                'II:C',
                                '1:',
                                '1|',
                                '1|',
                                '1|',
                                '1:',
                                '1:',
                                '1|',
                                '1|',
                                '1|',
                                '1|'],
                               ['2',
                                'II:B',
                                '2:',
                                '1|',
                                '1|',
                                '2|',
                                '2:',
                                '2:',
                                '2|',
                                '1|',
                                '1|',
                                '1|'],
                               ['2',
                                'II:B',
                                '1:',
                                '1|',
                                '1|',
                                '1|',
                                '1:',
                                '1:',
                                '1|',
                                '1|',
                                '1|',
                                '1|'],
                               ['2',
                                'II:A',
                                '1:',
                                '1|',
                                '2|',
                                '1|',
                                '2:',
                                '2:',
                                '1|',
                                '2|',
                                '2|',
                                '2|'],
                               ['2',
                                'II:A',
                                '1:',
                                '2|',
                                '1|',
                                '1|',
                                '1:',
                                '1:',
                                '1|',
                                '1|',
                                '1|',
                                '1|']])]),
                {'V0-176659933': 0.00015,
                 'V1-176660247': 0.00015,
                 'V2-176660371': 0.00015,
                 'V3-176664842': 0.241943,
                 'V4-176668823': 0.00015,
                 'V5-176671914': 0.00015,
                 'V6-176679384': 0.00015,
                 'V7-176734756': 0.352899,
                 'V8-176811754': 0.00015,
                 'V9-176811873': 0.00015},
                {'1': ['V0-176659933',
                  'V1-176660247',
                  'V2-176660371',
                  'V3-176664842',
                  'V4-176668823',
                  'V5-176671914',
                  'V6-176679384',
                  'V7-176734756',
                  'V8-176811754',
                  'V9-176811873'],
                 '2': ['V0-176659933',
                  'V1-176660247',
                  'V2-176660371',
                  'V3-176664842',
                  'V4-176668823',
                  'V5-176671914',
                  'V6-176679384',
                  'V7-176734756',
                  'V8-176811754',
                  'V9-176811873']},
                [['V3-176664842', 'V6-176679384']]],
               'output': [(('1', 'I:1', '4', '1222222221', '3', '2122212212'),
                 ('1', 'I:2', '2', '1111111221', '1', '1111111111'),
                 ('1', 'II:4', '2', '1111111221', '4', '1222222221'),
                 ('1', 'II:3', '1', '1111111111', '3', '2122212212'),
                 ('1', 'II:2', '1', '1111111111', '3', '2122212212'),
                 ('1', 'II:1', '1', '1111111111', '3', '2122212212'),
                 ('2', 'I:A', '1', '1111111111', '2', '1212112111'),
                 ('2', 'I:B', '4', '1121221222', '3', '2112222111'),
                 ('2', 'II:D', '4', '1121221222', '1', '1111111111'),
                 ('2', 'II:C', '3', '2112222111', '1', '1111111111'),
                 ('2', 'II:B', '3', '2112222111', '1', '1111111111'),
                 ('2', 'II:A', '4', '1121221222', '2', '1212112111')),
                {'II:2': (['1'], ['1111111111']),
                 'I:2': (['2'], ['1111111221']),
                 'I:1': (['4'], ['1222222221']),
                 'II:3': (['1'], ['1111111111']),
                 'II:4': (['2'], ['1111111221']),
                 'II:1': (['1'], ['1111111111']),
                 'II:B': (['3'], ['2112222111']),
                 'I:B': (['4'], ['1121221222']),
                 'I:A': (['1'], ['1111111111']),
                 'II:C': (['3'], ['2112222111']),
                 'II:D': (['4'], ['1121221222']),
                 'II:A': (['4'], ['1121221222'])},
                1,
                OrderedDict([('1',
                              ((0.9998363838109469,
                                0.00016361618905309718,
                                1.3225147758256331e-20,
                                7.937469761930493e-24),)),
                             ('2',
                              ((0.9999999928166995,
                                7.183300126057594e-09,
                                3.233697565987606e-16,
                                1.6580038589959425e-19),))])]}),
             ('format',
              {'II:2': [('1', '1111111111')],
               'I:2': [('2', '1111111221')],
               'I:1': [('4', '1222222221')],
               'II:3': [('1', '1111111111')],
               'II:4': [('2', '1111111221')],
               'II:1': [('1', '1111111111')],
               'II:B': [('3', '2112222111')],
               'I:B': [('4', '1121221222')],
               'I:A': [('1', '1111111111')],
               'II:C': [('3', '2112222111')],
               'II:D': [('4', '1121221222')],
               'II:A': [('4', '1121221222')]})])
{% endraw %}

One region test

{% raw %}
region = regions[1]
{% endraw %} {% raw %}
extractor.getRegion(region)
maker.getRegion(region)
writer.getRegion(region)  
extractor.apply(data)
0
{% endraw %} {% raw %}
maker.apply(data)
data {'II:2': ['12', '12', '11', '11', '11', '12', '12'], 'I:2': ['12', '12', '11', '11', '11', '12', '12'], 'I:1': ['12', '12', '11', '11', '11', '11', '12'], 'II:3': ['12', '12', '11', '11', '11', '12', '12'], 'II:4': ['20', '12', '12', '12', '12', '12', '22'], 'II:1': ['22', '22', '11', '11', '11', '12', '22'], 'II:B': ['11', '11', '11', '11', '11', '11', '12'], 'I:B': ['12', '12', '12', '12', '12', '12', '11'], 'I:A': ['11', '11', '11', '11', '11', '11', '22'], 'II:C': ['11', '11', '11', '11', '11', '11', '12'], 'II:D': ['12', '12', '12', '12', '12', '12', '12'], 'II:A': ['12', '12', '12', '12', '12', '12', '12']}
Estimating allele frequencies... [using maximum likelihood]0
   V0-89984370 V1-89984604 V2-89984739 V3-89985940 V4-89986608 V5-89986760 
   V6-89987201 

V0-89984370: 0 0.5 0.5 
total familyCount:1
V1-89984604: 0 0.5 0.5 
total familyCount:1
V2-89984739: 0 1 
total familyCount:1
V3-89985940: 0 1 
total familyCount:1
V4-89986608: 0 1 
total familyCount:1
V5-89986760: 0 0.75 0.25 
total familyCount:1
V6-89987201: 0 0.5 0.5 
total familyCount:1
Estimating allele frequencies... [using maximum likelihood]0
   V0-89984370 V1-89984604 V2-89984739 V3-89985940 V4-89986608 V5-89986760 
   V6-89987201 

V0-89984370: 0 0.75 0.25 
total familyCount:1
V1-89984604: 0 0.75 0.25 
total familyCount:1
V2-89984739: 0 0.75 0.25 
total familyCount:1
V3-89985940: 0 0.75 0.25 
total familyCount:1
V4-89986608: 0 0.75 0.25 
total familyCount:1
V5-89986760: 0 0.75 0.25 
total familyCount:1
V6-89987201: 0 0.5 0.5 
total familyCount:1
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/tmp/1974374.1.plot.q/ipykernel_14355/1565046380.py in <module>
----> 1 maker.apply(data)

/tmp/1974374.1.plot.q/ipykernel_14355/1421714082.py in apply(self, data)
    119             else:
    120                 # calculate LD clusters using founder haplotypes
--> 121                 clusters = self.__ClusterByLD(data, haplotypes, varnames)
    122                 print('clusters:',clusters)
    123                 # recoding the genotype of the region

/tmp/1974374.1.plot.q/ipykernel_14355/1421714082.py in __ClusterByLD(self, data, haplotypes, varnames)
    236                 founder_haplotypes.append(("{}-{}".format(hap[1], ihap % 2), "".join([x[1] if x[0].isupper() else x[0] for x in gt])))
    237         # calculate LD blocks, use r2 measure
--> 238         ld = Align.create(founder_haplotypes).matrixLD(validCharacters="12")["r2"]
    239         blocks = []
    240         for j in ld:

~/miniconda3/envs/seqpy3v0/lib/python3.9/site-packages/egglib/_interface.py in create(cls, source, alphabet)
    582 
    583         if not isinstance(source, (Align, Container)):
--> 584             if alphabet is None: raise ValueError('alphabet is required for object creation from an iterable')
    585             new_instance.__init__(alphabet)
    586             new_instance.add_samples(source)

ValueError: alphabet is required for object creation from an iterable
{% endraw %} {% raw %}
haplotypes = OrderedDict()
mafs = {}   ##Per fam per variant
varnames = {}
{% endraw %} {% raw %}
maker._MarkerMaker__Haplotype(data, haplotypes, mafs, varnames)
{% endraw %} {% raw %}
items = list(data.families.keys())
{% endraw %} {% raw %}
item = items[1]
{% endraw %} {% raw %}
# Debug one family by hand: `item` is the family id picked in the previous cell.
print('running family',item)
# getFamVariants(..., style="map") returns three values, bound here to the
# family's variant names, their positions and the MAFs taken from the VCF.
varnames[item], positions, vcf_mafs = data.getFamVariants(item, style = "map")
if len(varnames[item]) == 0:
    # No variant for this family: every member gets `maker.missings`
    # (presumably the all-missing genotype placeholder -- TODO confirm).
    print('here')
    for person in data.families[item]:
        data[person] = maker.missings
running family 1007
{% endraw %} {% raw %}
varnames[item]
['V8-13273', 'V17-13417']
{% endraw %} {% raw %}
tmp_log_output=env.tmp_log + str(os.getpid())
maker.haplotyper.Execute(data.chrom, varnames[item], sorted(positions), data.getFamSamples(item), maker.rsq, tmp_log_output)
{% endraw %} {% raw %}
data.getFamSamples(item)
[['4_364', '4_364_1', '0', '0', '1', '00', '00'],
 ['4_364', '4_364_17', '0', '0', '2', '00', '00'],
 ['4_364', '4_364_14', '0', '0', '1', '00', '00'],
 ['4_364', '4_364_13', '0', '0', '1', '00', '00'],
 ['4_364', '4_364_21', '0', '0', '1', '00', '00'],
 ['4_364', '4_364_2', '0', '0', '2', '00', '00'],
 ['4_364', '4_364_7', '4_364_1', '4_364_2', '1', '00', '11'],
 ['4_364', '4_364_6', '4_364_1', '4_364_2', '2', '11', '11'],
 ['4_364', '4_364_5', '4_364_1', '4_364_2', '2', '11', '11'],
 ['4_364', '4_364_20', '4_364_21', '4_364_5', '2', '00', '00'],
 ['4_364', '4_364_22', '4_364_21', '4_364_5', '1', '00', '00'],
 ['4_364', '4_364_99', '4_364_1', '4_364_2', '2', '12', '12'],
 ['4_364', '4_364_9', '4_364_13', '4_364_99', '2', '11', '11'],
 ['4_364', '4_364_12', '4_364_13', '4_364_99', '2', '00', '00'],
 ['4_364', '4_364_11', '4_364_13', '4_364_99', '1', '00', '00'],
 ['4_364', '4_364_10', '4_364_13', '4_364_99', '1', '00', '00'],
 ['4_364', '4_364_8', '4_364_13', '4_364_99', '2', '00', '00'],
 ['4_364', '4_364_16', '4_364_14', '4_364_8', '1', '00', '00'],
 ['4_364', '4_364_18', '4_364_16', '4_364_17', '1', '00', '00'],
 ['4_364', '4_364_19', '4_364_16', '4_364_17', '2', '00', '00'],
 ['4_364', '4_364_15', '4_364_14', '4_364_8', '2', '00', '00'],
 ['4_364', '4_364_3', '4_364_1', '4_364_2', '1', '00', '00'],
 ['4_364', '4_364_4', '4_364_1', '4_364_2', '2', '00', '00'],
 ['4_364', '4_364_23', '0', '0', '1', '00', '00'],
 ['4_364', '4_364_DCH23.4', '4_364_23', '4_364_4', '1', '00', '00']]
{% endraw %} {% raw %}
# Scratch copy of a per-family haplotyping loop. It still refers to `self`,
# so it appears to be pasted from a method body (presumably the one invoked
# as `maker._MarkerMaker__Haplotype` elsewhere in this notebook -- TODO confirm).
# It fills `haplotypes`, `mafs` and `varnames`, all keyed as shown below.
self.markers = ["V{}-{}".format(idx, item[1]) for idx, item in enumerate(data.variants)]
for item in data.families:
    varnames[item], positions, vcf_mafs = data.getFamVariants(item, style = "map")
    if len(varnames[item]) == 0:
        # No variant in this family: assign `self.missings` to every member
        # and move on to the next family.
        for person in data.families[item]:
            data[person] = self.missings
        continue
    if env.debug:
        # Dump the family's sample rows, tab-separated, to stderr.
        with env.lock:
            sys.stderr.write('\n'.join(['\t'.join(x) for x in data.getFamSamples(item)]) + '\n\n')
    # haplotyping
    with env.lock:
        if not env.prephased:
            #with stdoutRedirect(to = env.tmp_log + str(os.getpid()) + '.log'):
            #    haplotypes[item] = self.haplotyper.Execute(data.chrom, varnames[item],
            #                                           sorted(positions), data.getFamSamples(item))[0]
            # Per-process log prefix: the temp log path followed by this PID.
            tmp_log_output=env.tmp_log + str(os.getpid())
            #with stdoutRedirect(to = tmp_log_output + '.log'):
            # Only the first element of Execute's result is kept.
            haplotypes[item] = self.haplotyper.Execute(data.chrom, varnames[item], sorted(positions),
                                                           data.getFamSamples(item), self.rsq, tmp_log_output)[0]

        else:
            haplotypes[item] = self.__PedToHaplotype(data.getFamSamples(item))
    if len(haplotypes[item]) == 0:
        # C++ haplotyping implementation failed
        with env.chperror_counter.get_lock():
            env.chperror_counter.value += 1
    # either use the provided MAFs or compute MAFs from the founders
    if all(vcf_mafs):
        # Every VCF MAF is truthy: record it for each variant not seen yet.
        for idx, v in enumerate(varnames[item]):
            if v not in mafs:
                mafs[v] = vcf_mafs[idx]
    else:
        # count founder alleles
        for hap in haplotypes[item]:
            # hap[1] is the id checked against the pedigree; non-founders are skipped.
            if not data.tfam.is_founder(hap[1]):
                continue
            for idxv, v in enumerate(varnames[item]):
                if v not in mafs:
                    # [#alt, #haplotypes]
                    mafs[v] = [0, 0]
                # Genotype strings start at index 2 of `hap`. When the first
                # character is uppercase the second character is used as the
                # allele, otherwise the first character is.
                gt = hap[2 + idxv][1] if hap[2 + idxv][0].isupper() else hap[2 + idxv][0]
                # "?" alleles contribute to neither count.
                if not gt == "?":
                    mafs[v][0] += self.gtconv[gt]
                    mafs[v][1] += 1.0
#
{% endraw %} {% raw %}
maker._MarkerMaker__Haplotype(data, haplotypes, mafs, varnames)
{% endraw %}

Test3

{% raw %}
    # Log each skip counter, but only when its value is non-zero.
    if env.triallelic_counter.value:
        env.log('{:,d} tri-allelic loci were ignored'.format(env.triallelic_counter.value))
    if env.commonvar_counter.value:
        # The message reports the counter together with the configured MAF cutoff.
        env.log('{:,d} variants ignored due to having MAF > {} and other specified constraints'.\
                format(env.commonvar_counter.value, args.maf_cutoff))
    if env.null_counter.value:
        env.log('{:,d} units ignored due to absence in VCF file'.format(env.null_counter.value))
    if env.trivial_counter.value:
        env.log('{:,d} units ignored due to absence of variation in samples'.format(env.trivial_counter.value))
MESSAGE: 1 units ignored due to absence in VCF file
{% endraw %} {% raw %}
# Scan the temporary logs for fatal errors, report failures, then archive
# the regional marker data.
fatal_errors = 0
try:
    # Error msg from C++ extension
    # Concatenate every file matching `*.*` in the temp directory into the
    # combined log through the shell, then read the entry stored under the
    # key 'fatal' in wordCount's result.
    # NOTE(review): the paths are interpolated into the shell command
    # unquoted, and `cat` prints "No such file or directory" when the
    # pattern matches nothing.
    os.system("cat {}/*.* > {}".format(env.tmp_dir, env.tmp_log))
    fatal_errors = wordCount(env.tmp_log)['fatal']
except KeyError:
    # wordCount's result has no 'fatal' entry: keep the default of 0.
    pass
if env.chperror_counter.value:
    env.error("{:,d} regional markers failed to be generated due to haplotyping failures!".\
              format(env.chperror_counter.value))
if fatal_errors:
    env.error("{:,d} or more regional markers failed to be generated due to runtime errors!".\
              format(fatal_errors))
env.log('Archiving regional marker data to directory [{}]'.format(env.cache_dir))
cache.write(arcroot = 'CACHE', source_dir = env.tmp_cache)
# Restore the job count requested on the command line.
env.jobs = args.jobs
cat: './seqlinkage-example/tmprst/SEQLinkage_tmp_wk92qcpq/*.*': No such file or directory
MESSAGE: Archiving regional marker data to directory [./seqlinkage-example/cache]
{% endraw %} {% raw %}
env.tmp_cache
'./seqlinkage-example/tmprst/SEQLinkage_tmp_wk92qcpq/CACHE'
{% endraw %} {% raw %}
os.listdir(env.tmp_cache)
['tsq20211129.chr1.freq', 'tsq20211129.chr1.tped', 'tsq20211129.tfam']
{% endraw %} {% raw %}
env.jobs
16
{% endraw %} {% raw %}
# Collect the TPED files of this run from the temporary cache: file names
# that start with the output prefix and end in '.tped'.
tpeds = [
    os.path.join(env.tmp_cache, fname)
    for fname in os.listdir(env.tmp_cache)
    if fname.startswith(env.output) and fname.endswith('.tped')
]
for fmt in args.format:
    cache.setID(fmt)
    if not args.vanilla and cache.check():
        # --vanilla is off and the cache check passed: load the archived
        # files for this format and go on to the next one.
        env.log('Loading {} data from archive ...'.format(fmt.upper()))
        cache.load(target_dir = env.tmp_dir, names = [fmt.upper()])
        continue
    # Otherwise convert the TPED files to this format and archive the result.
    env.log('{:,d} units will be converted to {} format'.format(env.success_counter.value, fmt.upper()))
    env.format_counter.value = 0
    tfam_path = os.path.join(env.tmp_cache, "{}.tfam".format(env.output))
    format(
        tpeds, tfam_path,
        args.prevalence, args.wild_pen, args.muta_pen, fmt,
        args.inherit_mode, args.theta_max, args.theta_inc,
    )
    env.log(
        '{:,d} units successfully converted to {} format\n'.format(
            env.format_counter.value, fmt.upper()),
        flush = True,
    )
    if env.skipped_counter.value:
        # FIXME: perhaps we need to rephrase this message?
        env.log('{} region - family pairs skipped'.format(env.skipped_counter.value))
    env.log('Archiving {} format to directory [{}]'.format(fmt.upper(), env.cache_dir))
    cache.write(
        arcroot = fmt.upper(),
        source_dir = os.path.join(env.tmp_dir, fmt.upper()),
        mode = 'a',
    )
mkpath(env.outdir)
MESSAGE: 2 units will be converted to MERLIN format
16
16
MESSAGE: 1 units successfully converted to MERLIN format
MESSAGE: Archiving MERLIN format to directory [./seqlinkage-example/cache]
MESSAGE: 2 units will be converted to LINKAGE format
MESSAGE: 1 units successfully converted to LINKAGE format
MESSAGE: Archiving LINKAGE format to directory [./seqlinkage-example/cache]
{% endraw %}

2. Testing run_linkage

{% raw %}
args.run_linkage = True
{% endraw %} {% raw %}
cache.setID('analysis')
{% endraw %} {% raw %}
env.output
'LINKAGE'
{% endraw %} {% raw %}
?cache.load
Signature: cache.load(target_dir=None, names=None)
Docstring: <no docstring>
File:      /mnt/mfs/statgen/yin/Github/linkage/SEQpy3/SEQLinkage/Utils.py
Type:      method
{% endraw %} {% raw %}
cache.cache_name
'/mnt/mfs/statgen/yin/Github/linkage/SEQpy3/cache/LINKAGE.cache'
{% endraw %} {% raw %}
not args.vanilla and cache.check()
False
{% endraw %} {% raw %}
fmt = args.format[0]
{% endraw %} {% raw %}
args.blueprint
'data/genemap.hg38.txt'
{% endraw %} {% raw %}
args.theta_inc
0.05
{% endraw %} {% raw %}
args.theta_max
0.5
{% endraw %} {% raw %}
args.output_limit
10
{% endraw %} {% raw %}
# Run the linkage analysis (or reload its archived result) and build the HTML
# report; when linkage is not requested, load the converted data into the
# output directory instead.
#
# The fixed log messages below used to be passed through
# `.format(fmt.upper())` even though they contain no matching placeholder.
# That call was a no-op, but it made this branch fail with NameError whenever
# `fmt` had not been bound by an earlier cell. The emitted text is unchanged.
if args.run_linkage:
    cache.setID('analysis')
    if not args.vanilla and cache.check():
        # --vanilla is off and the cache check passed: reuse the archived heatmap.
        env.log('Loading linkage analysis result from archive ...')
        cache.load(target_dir = env.output, names = ['heatmap'])
    else:
        env.log('Running linkage analysis ...')
        run_linkage(args.blueprint, args.theta_inc, args.theta_max, args.output_limit)
        # NOTE(review): "succesfully" is misspelled in the message; it is kept
        # byte-identical so existing logs still match.
        env.log('Linkage analysis succesfully performed for {:,d} units\n'.\
                format(env.run_counter.value), flush = True)
        # Report each runtime-error counter only when it is non-zero.
        if env.makeped_counter.value:
            env.log('{} "makeped" runtime errors occurred'.format(env.makeped_counter.value))
        if env.pedcheck_counter.value:
            env.log('{} "pedcheck" runtime errors occurred'.format(env.pedcheck_counter.value))
        if env.unknown_counter.value:
            env.log('{} "unknown" runtime errors occurred'.format(env.unknown_counter.value))
        if env.mlink_counter.value:
            env.log('{} "mlink" runtime errors occurred'.format(env.mlink_counter.value))
        # Append the fresh heatmap directory to the archive.
        cache.write(arcroot = 'heatmap', source_dir = os.path.join(env.output, 'heatmap'), mode = 'a')
    html(args.theta_inc, args.theta_max, args.output_limit)
else:
    env.log('Saving data to [{}]'.format(os.path.abspath(env.output)))
    # One archive entry per requested format, named by its upper-cased label.
    cache.load(target_dir = env.output, names = [fmt.upper() for fmt in args.format])
MESSAGE: Running linkage analysis ...
MESSAGE: Linkage analysis succesfully performed for 1 units
MESSAGE: 2 "pedcheck" runtime errors occurred
MESSAGE: Report for [tsq20211129] is generated in HTML format
{% endraw %} {% raw %}
    # Fragment of the non-linkage branch run on its own: log the absolute
    # output path, then load one archive entry per requested format
    # (named by the upper-cased format) into the output directory.
    env.log('Saving data to [{}]'.format(os.path.abspath(env.output)))
    cache.load(target_dir = env.output, names = [fmt.upper() for fmt in args.format])
MESSAGE: Saving data to [/mnt/mfs/statgen/yin/Github/linkage/SEQpy2/testseqlink]
{% endraw %}

Testing main

{% raw %}
# Choose the cache object: NoCache when `args.no_save` is set, otherwise a
# Cache built from the cache directory, the output prefix and the parsed
# arguments. The cache id is then set to 'vcf' for the encoding step.
cache = NoCache() if args.no_save else Cache(env.cache_dir, env.output, vars(args))
cache.setID('vcf')
# STEP 1: write encoded data to TPED format
#
# When --vanilla is off and the cache check passes, the regional marker data
# is reloaded from the archive. Otherwise the VCF and pedigree are loaded, the
# regions are distributed to EncoderWorker jobs through a queue, the counters
# are reported and the result is archived.
if not args.vanilla and cache.check():
    env.log('Loading regional marker data from archive ...')
    cache.load(target_dir = env.tmp_dir, names = ['CACHE'])
    # Success counter = total number of lines over all cached TPED files.
    env.success_counter.value = sum(map(fileLinesCount, glob.glob('{}/*.tped'.format(env.tmp_cache))))
    env.batch = 10
else:
    # load VCF file header
    checkVCFBundle(args.vcf)
    cache.clear()
    try:
        vs = cstatgen.VCFstream(args.vcf)
    except Exception as e:
        env.error("{}".format(e), exit = True)
    samples_vcf = vs.GetSampleNames()
    if len(samples_vcf) == 0:
        env.error("Fail to extract samples from [{}]".format(args.vcf), exit = True)
    env.log('{:,d} samples found in [{}]'.format(len(samples_vcf), args.vcf))
    # Second element of checkSamples' result: samples in the FAM file but not in the VCF.
    samples_not_vcf = checkSamples(samples_vcf, getColumn(args.tfam, 2))[1]
    # load sample info
    data = RData(samples_vcf, TFAMParser(args.tfam))
    if len(data.families) == 0:
        env.error('No valid family to process. ' \
                  'Families have to be at least trio with at least one member in VCF file.', exit = True)
    if len(data.samples) == 0:
        env.error('No valid sample to process. ' \
                  'Samples have to be in families, and present in both TFAM and VCF files.', exit = True)
    rewriteFamfile(os.path.join(env.tmp_cache, '{}.tfam'.format(env.output)),
                   data.tfam.samples, list(data.samples.keys()) + samples_not_vcf)
    if args.single_markers:
        # One single-position region per genome coordinate reported by the VCF stream.
        regions = [(x[0], x[1], x[1], "{}:{}".format(x[0], x[1]), '.', '.', '.')
                   for x in vs.GetGenomeCoordinates()]
        args.blueprint = None
    else:
        # load blueprint
        try:
            env.log('Loading marker map from [{}] ...'.format(args.blueprint))
            with open(args.blueprint, 'r') as f:
                # One whitespace-split record per line of the blueprint file.
                regions = [x.strip().split() for x in f]
        except IOError:
            env.error("Cannot load regional marker blueprint [{}]. ".format(args.blueprint), exit = True)
    env.log('{:,d} families with a total of {:,d} samples will be scanned for {:,d} pre-defined units'.\
            format(len(data.families), len(data.samples), len(regions)))
    # At least one job, and never more jobs than regions.
    env.jobs = max(min(args.jobs, len(regions)), 1)
    # One trailing None per job is appended to the region list
    # (presumably the workers' stop signal -- TODO confirm in EncoderWorker).
    regions.extend([None] * env.jobs)
    queue = Queue()
    # The crash log must stay open for as long as the fault handler is
    # enabled. The `with` block closes it afterwards (the handle used to be
    # leaked), and `finally` disables the handler even when the run is
    # interrupted, so it never points at a closed file.
    with open(env.tmp_log + '.SEGV', 'w') as segv_log:
        faulthandler.enable(file=segv_log)
        try:
            for i in regions:
                queue.put(i)
            jobs = [EncoderWorker(
                queue, len(regions), deepcopy(data),
                RegionExtractor(args.vcf, chr_prefix = args.chr_prefix, allele_freq_info = args.freq),
                MarkerMaker(args.bin, maf_cutoff = args.maf_cutoff),
                LinkageWriter(len(samples_not_vcf))
                ) for i in range(env.jobs)]
            for j in jobs:
                j.start()
            for j in jobs:
                j.join()
        except KeyboardInterrupt:
            # FIXME: need to properly close all jobs
            raise ValueError("Use 'killall {}' to properly terminate all processes!".format(env.prog))
        finally:
            faulthandler.disable()
    # Reached only when the workers finished without raising.
    env.log('{:,d} units (from {:,d} variants) processed; '\
        '{:,d} Mendelian inconsistencies and {:,d} recombination events handled\n'.\
        format(env.success_counter.value,
               env.variants_counter.value,
               env.mendelerror_counter.value,
               env.recomb_counter.value), flush = True)
    # Report each skip counter only when it is non-zero.
    if env.triallelic_counter.value:
        env.log('{:,d} tri-allelic loci were ignored'.format(env.triallelic_counter.value))
    if env.commonvar_counter.value:
        env.log('{:,d} variants ignored due to having MAF > {}'.\
                format(env.commonvar_counter.value, args.maf_cutoff))
    if env.null_counter.value:
        env.log('{:,d} units ignored due to absence in VCF file'.format(env.null_counter.value))
    if env.trivial_counter.value:
        env.log('{:,d} units ignored due to absence of variation in samples'.format(env.trivial_counter.value))
    fatal_errors = 0
    try:
        # Error msg from C++ extension
        # NOTE(review): the paths are interpolated into the shell command unquoted.
        os.system("cat {}/*.* > {}".format(env.tmp_dir, env.tmp_log))
        fatal_errors = wordCount(env.tmp_log)['fatal']
    except KeyError:
        # No 'fatal' entry in the word count: keep the default of 0.
        pass
    if env.chperror_counter.value:
        env.error("{:,d} regional markers failed to be generated due to haplotyping failures!".\
                  format(env.chperror_counter.value))
    if fatal_errors:
        env.error("{:,d} or more regional markers failed to be generated due to runtime errors!".\
                  format(fatal_errors))
    env.log('Archiving regional marker data to directory [{}]'.format(env.cache_dir))
    cache.write(arcroot = 'CACHE', source_dir = env.tmp_cache)
# Restore the job count requested on the command line.
env.jobs = args.jobs
# STEP 2: write to PLINK or mega2 format
# Collect the TPED files of this run from the temporary cache: file names
# that start with the output prefix and end in '.tped'.
tpeds = [
    os.path.join(env.tmp_cache, fname)
    for fname in os.listdir(env.tmp_cache)
    if fname.startswith(env.output) and fname.endswith('.tped')
]
for fmt in args.format:
    print(fmt.lower())
    cache.setID(fmt.lower())
    if not args.vanilla and cache.check():
        # --vanilla is off and the cache check passed: load the archived
        # files for this format and go on to the next one.
        env.log('Loading {} data from archive ...'.format(fmt.upper()))
        cache.load(target_dir = env.tmp_dir, names = [fmt.upper()])
        continue
    # Otherwise convert the TPED files to this format and archive the result.
    env.log('{:,d} units will be converted to {} format'.format(env.success_counter.value, fmt.upper()))
    env.format_counter.value = 0
    tfam_path = os.path.join(env.tmp_cache, "{}.tfam".format(env.output))
    format(
        tpeds, tfam_path,
        args.prevalence, args.wild_pen, args.muta_pen, fmt,
        args.inherit_mode, args.theta_max, args.theta_inc,
    )
    env.log(
        '{:,d} units successfully converted to {} format\n'.format(
            env.format_counter.value, fmt.upper()),
        flush = True,
    )
    if env.skipped_counter.value:
        # FIXME: perhaps we need to rephrase this message?
        env.log('{} region - family pairs skipped'.format(env.skipped_counter.value))
    env.log('Archiving {} format to directory [{}]'.format(fmt.upper(), env.cache_dir))
    cache.write(
        arcroot = fmt.upper(),
        source_dir = os.path.join(env.tmp_dir, fmt.upper()),
        mode = 'a',
    )
mkpath(env.outdir)
# Final step: run the linkage analysis (or reload its archived result) and
# build the HTML report; when linkage is not requested, load the cached data
# into the output directory instead.
#
# The fixed log messages below used to be passed through
# `.format(fmt.upper())` even though they contain no matching placeholder.
# That call was a no-op, but `fmt` is only bound by the preceding
# `for fmt in args.format` loop, so an empty format list made this branch
# fail with NameError. The emitted text is unchanged.
if args.run_linkage:
    cache.setID('analysis')
    if not args.vanilla and cache.check():
        # --vanilla is off and the cache check passed: reuse the archived heatmap.
        env.log('Loading linkage analysis result from archive ...')
        cache.load(target_dir = env.output, names = ['heatmap'])
    else:
        env.log('Running linkage analysis ...')
        run_linkage(args.blueprint, args.theta_inc, args.theta_max, args.output_limit)
        # NOTE(review): "succesfully" is misspelled in the message; it is kept
        # byte-identical so existing logs still match.
        env.log('Linkage analysis succesfully performed for {:,d} units\n'.\
                format(env.run_counter.value), flush = True)
        # Report each runtime-error counter only when it is non-zero.
        if env.makeped_counter.value:
            env.log('{} "makeped" runtime errors occurred'.format(env.makeped_counter.value))
        if env.pedcheck_counter.value:
            env.log('{} "pedcheck" runtime errors occurred'.format(env.pedcheck_counter.value))
        if env.unknown_counter.value:
            env.log('{} "unknown" runtime errors occurred'.format(env.unknown_counter.value))
        if env.mlink_counter.value:
            env.log('{} "mlink" runtime errors occurred'.format(env.mlink_counter.value))
        # Append the fresh heatmap directory to the archive.
        cache.write(arcroot = 'heatmap', source_dir = os.path.join(env.output, 'heatmap'), mode = 'a')
    html(args.theta_inc, args.theta_max, args.output_limit)
else:
    env.log('Saving data to [{}]'.format(os.path.abspath(env.output)))
    # No `names` filter is passed here, so the default (names=None) applies.
    cache.load(target_dir = env.output)
MESSAGE: 3,479 samples found in [/mnt/mfs/statgen/yin/Github/linkage/SEQpy3/data/first1000snp_full_samples.vcf.gz]
MESSAGE: 7 samples found in FAM file but not in VCF file:
28_9_101, 28_9_100, 28_9_186, 1036_2, 22_1_20, 1036_1, 22_1_10
MESSAGE: 3,461 samples in VCF file will be ignored due to absence in FAM file
MESSAGE: Loading marker map from [data/genemap.hg38.txt] ...
MESSAGE: 3 families with a total of 18 samples will be scanned for 28,325 pre-defined units
MESSAGE: 0 units processed {5.65%} ...                                       
in Haplotype
MESSAGE: 0 units processed {19.76%} ...                                      
Estimating allele frequencies... [using maximum likelihood]0
   V8-13273 V12-13302 V17-13417 V22-13687 

V8-13273: 0 0.707106 0.292894 
total familyCount:1
V12-13302: 0 0.75 0.25 
total familyCount:1
V17-13417: 0 0.75 0.25 
total familyCount:1
V22-13687: 0 0.75 0.25 
total familyCount:1
in Haplotype
MESSAGE: 0 units processed {14.12%} ...                                      
in Haplotype
MESSAGE: 0 units processed {11.29%} ...                                      


running familyrunning family running family1036  
10361036
running familyrunning familyrunning family
 22_1 
22_1 running family
 running family22_1Estimating allele frequencies... [using maximum likelihood]0
   V8-13273 V12-13302 V17-13417 V22-13687 

V8-13273: 0 0.707106 0.292894 
total familyCount:1
V12-13302: 0 0.75 0.25 
total familyCount:1
V17-13417: 0 0.75 0.25 
total familyCount:1
V22-13687: 0 0.75 0.25 
total familyCount:1
Estimating allele frequencies... [using maximum likelihood]0
   V4-17385 V8-17407 

V4-17385: 0 1 
total familyCount:1
V8-17407: 0 0.5 0.5 
total familyCount:1
Estimating allele frequencies... [using maximum likelihood]0
   V8-13273 V15-13380 V17-13417 

V8-13273: 0 0.744757 0.255243 
total familyCount:1
V15-13380: 0 0.744757 0.255243 
total familyCount:1
V17-13417: 0 0.744757 0.255243 
total familyCount:1
Estimating allele frequencies... [using maximum likelihood]0
   V4-17385 V5-17398 V8-17407 

V4-17385: 0 0.739454 0.260546 
total familyCount:1
V5-17398: 0 0.744757 0.255243 
total familyCount:1
V8-17407: 0 0.744757 0.255243 
total familyCount:1

28_9running family
 28_9 28_9

MESSAGE: 3 units processed {36.70%} ...                                      
Estimating allele frequencies... [using maximum likelihood]0
   V8-13273 V15-13380 V17-13417 

V8-13273: 0 0.744757 0.255243 
total familyCount:1
V15-13380: 0 0.744757 0.255243 
total familyCount:1
V17-13417: 0 0.744757 0.255243 
total familyCount:1
Estimating allele frequencies... [using maximum likelihood]0
   V3-17379 V4-17385 V9-17408 

V3-17379: 0 0.824706 0.175294 
total familyCount:1
V4-17385: 0 0.5 0.5 
total familyCount:1
V9-17408: 0 0.824706 0.175294 
total familyCount:1
MESSAGE: 3 units processed {73.39%} ...                                      
in Haplotype
running family 1036
running family 22_1
MESSAGE: 3 units processed {79.04%} ...
running family 
MESSAGE: 3 units processed {81.86%} ...
28_9
MESSAGE: 3 units processed {81.86%} ...

MESSAGE: 4 units processed {93.15%} ...                                      
Estimating allele frequencies... [using maximum likelihood]0
   V3-14464 V8-14653 V18-14907 V19-14930 V20-14933 V40-16103 V50-16378 
   V55-16487 V69-17147 V79-17358 V85-17385 V89-17407 V124-17697 V151-17928 
   V152-17929 V159-20184 V160-20191 V162-20212 V165-20227 V169-20235 
   V171-20250 V178-20316 V180-20485 V182-20522 V184-20547 V194-29368 

V3-14464: 0 0.75 0.25 
total familyCount:1
V8-14653: 0 0.75 0.25 
total familyCount:1
V18-14907: 0 0.5 0.5 
total familyCount:1
V19-14930: 0 0.5 0.5 
total familyCount:1
V20-14933: 0 1 
total familyCount:1
V40-16103: 0 0.707106 0.292894 
total familyCount:1
V50-16378: 0 0.5 0.5 
total familyCount:1
V55-16487: 0 0.75 0.25 
total familyCount:1
V69-17147: 0 0.5 0.5 
total familyCount:1
V79-17358: 0 1 
total familyCount:1
V85-17385: 0 1 
total familyCount:1
V89-17407: 0 0.5 0.5 
total familyCount:1
V124-17697: 0 0.5 0.5 
total familyCount:1
V151-17928: 0 0.5 0.5 
total familyCount:1
V152-17929: 0 0.5 0.5 
total familyCount:1
V159-20184: 0 0.5 0.5 
total familyCount:1
V160-20191: 0 1 
total familyCount:1
V162-20212: 0 1 
total familyCount:1
V165-20227: 0 1 
total familyCount:1
V169-20235: 0 1 
total familyCount:1
V171-20250: 0 1 
total familyCount:1
V178-20316: 0 1 
total familyCount:1
V180-20485: 0 1 
total familyCount:1
V182-20522: 0 1 
total familyCount:1
V184-20547: 0 0.75 0.25 
total familyCount:1
V194-29368: 0 1 
total familyCount:1
Estimating allele frequencies... [using maximum likelihood]0
   V3-14464 V8-14653 V10-14677 V18-14907 V19-14930 V40-16103 V50-16378 
   V55-16487 V69-17147 V80-17365 V85-17385 V86-17398 V89-17407 V95-17479 
   V108-17559 V112-17589 V124-17697 V129-17722 V131-17746 V142-17829 
   V155-19190 V159-20184 V160-20191 V162-20212 V165-20227 V169-20235 
   V171-20250 V178-20316 V184-20547 

V3-14464: 0 0.739454 0.260546 
total familyCount:1
V8-14653: 0 0.707103 0.292897 
total familyCount:1
V10-14677: 0 0.739454 0.260546 
total familyCount:1
V18-14907: 0 0.707103 0.292897 
total familyCount:1
V19-14930: 0 0.728713 0.271287 
total familyCount:1
V40-16103: 0 0.5 0.5 
total familyCount:1
V50-16378: 0 0.5 0.5 
total familyCount:1
V55-16487: 0 0.744757 0.255243 
total familyCount:1
V69-17147: 0 0.744757 0.255243 
total familyCount:1
V80-17365: 0 0.744757 0.255243 
total familyCount:1
V85-17385: 0 0.739454 0.260546 
total familyCount:1
V86-17398: 0 0.744757 0.255243 
total familyCount:1
V89-17407: 0 0.744757 0.255243 
total familyCount:1
V95-17479: 0 0.744757 0.255243 
total familyCount:1
V108-17559: 0 0.744757 0.255243 
total familyCount:1
V112-17589: 0 0.744757 0.255243 
total familyCount:1
V124-17697: 0 0.739454 0.260546 
total familyCount:1
V129-17722: 0 0.744757 0.255243 
total familyCount:1
V131-17746: 0 0.739454 0.260546 
total familyCount:1
V142-17829: 0 0.744757 0.255243 
total familyCount:1
V155-19190: 0 0.739454 0.260546 
total familyCount:1
V159-20184: 0 0.744757 0.255243 
total familyCount:1
V160-20191: 0 0.744757 0.255243 
total familyCount:1
V162-20212: 0 0.744757 0.255243 
total familyCount:1
V165-20227: 0 0.744757 0.255243 
total familyCount:1
V169-20235: 0 0.744757 0.255243 
total familyCount:1
V171-20250: 0 0.739454 0.260546 
total familyCount:1
V178-20316: 0 0.739454 0.260546 
total familyCount:1
V184-20547: 0 0.739454 0.260546 
total familyCount:1
Estimating allele frequencies... [using maximum likelihood]0
   V3-14464 V5-14470 V8-14653 V10-14677 V18-14907 V19-14930 V34-16068 
   V40-16103 V50-16378 V84-17379 V85-17385 V90-17408 V105-17519 V107-17556 
   V116-17614 V155-19190 V158-20166 V160-20191 V162-20212 V165-20227 
   V166-20227 V168-20231 V169-20235 V171-20250 V174-20254 V178-20316 
   V183-20545 V184-20547 V194-29368 

V3-14464: 0 0.646634 0.353366 
total familyCount:1
V5-14470: 0 0.824706 0.175294 
total familyCount:1
V8-14653: 0 0.452863 0.547137 
total familyCount:1
V10-14677: 0 0.656432 0.343568 
total familyCount:1
V18-14907: 0 0.5 0.5 
total familyCount:1
V19-14930: 0 0.628665 0.371335 
total familyCount:1
V34-16068: 0 0.5 0.5 
total familyCount:1
V40-16103: 0 0.628667 0.371333 
total familyCount:1
V50-16378: 0 0.617751 0.382249 
total familyCount:1
V84-17379: 0 0.824706 0.175294 
total familyCount:1
V85-17385: 0 0.5 0.5 
total familyCount:1
V90-17408: 0 0.824706 0.175294 
total familyCount:1
V105-17519: 0 0.824706 0.175294 
total familyCount:1
V107-17556: 0 0.820231 0.179769 
total familyCount:1
V116-17614: 0 0.824706 0.175294 
total familyCount:1
V155-19190: 0 0.6 0.4 
total familyCount:1
V158-20166: 0 0.628665 0.371335 
total familyCount:1
V160-20191: 0 0.628665 0.371335 
total familyCount:1
V162-20212: 0 0.628665 0.371335 
total familyCount:1
V165-20227: 0 0.656432 0.343568 
total familyCount:1
V166-20227: 0 0.646634 0.353366 
total familyCount:1
V168-20231: 0 0.811255 0.188745 
total familyCount:1
V169-20235: 0 0.824706 0.175294 
total familyCount:1
V171-20250: 0 0.628665 0.371335 
total familyCount:1
V174-20254: 0 0.824706 0.175294 
total familyCount:1
V178-20316: 0 0.628665 0.371335 
total familyCount:1
V183-20545: 0 0.820231 0.179769 
total familyCount:1
V184-20547: 0 0.617751 0.382249 
total familyCount:1
V194-29368: 0 0.656432 0.343568 
total familyCount:1
                                                                      
MESSAGE: 4 units processed {99.94%} ...                                      
Estimating allele frequencies... [using maximum likelihood]0
   V8-13273 V12-13302 V17-13417 V22-13687 

V8-13273: 0 0.707106 0.292894 
total familyCount:1
V12-13302: 0 0.75 0.25 
total familyCount:1
V17-13417: 0 0.75 0.25 
total familyCount:1
V22-13687: 0 0.75 0.25 
total familyCount:1
Estimating allele frequencies... [using maximum likelihood]0
   V8-13273 V15-13380 V17-13417 

V8-13273: 0 0.744757 0.255243 
total familyCount:1
V15-13380: 0 0.744757 0.255243 
total familyCount:1
V17-13417: 0 0.744757 0.255243 
total familyCount:1
MESSAGE: 4 units (from 256 variants) processed; 24 Mendelian inconsistencies and 116 recombination events handled
MESSAGE: 28,321 units ignored due to absence in VCF file
MESSAGE: Archiving regional marker data to directory [/mnt/mfs/statgen/yin/Github/linkage/SEQpy3/cache]
merlin
16
MESSAGE: 4 units will be converted to MERLIN format
MESSAGE: 4 units successfully converted to MERLIN format
MESSAGE: Archiving MERLIN format to directory [/mnt/mfs/statgen/yin/Github/linkage/SEQpy3/cache]
MESSAGE: Saving data to [/mnt/mfs/statgen/yin/Github/linkage/SEQpy3/LINKAGE]
{% endraw %} {% raw %}
1
{% endraw %} {% raw %}
args.run_linkage = True
{% endraw %} {% raw %}
region = ['1', '69090', '70008', 'OR4F5', '4.866641545668504e-06', '6.181823219621424e-06', '3.6135725636621673e-06']
{% endraw %} {% raw %}
extractor.getRegion(region)
maker.getRegion(region)
writer.getRegion(region)
{% endraw %} {% raw %}
haplotypes = OrderedDict()
mafs = {}   ##Per fam per variant
uniq_vars = []
exclude_vars = []
varnames = {}
recombPos = {}
{% endraw %} {% raw %}
extractor.apply(data)
{% endraw %} {% raw %}
maker._MarkerMaker__Haplotype(data, haplotypes, mafs, varnames,recombPos,uniq_vars,exclude_vars)
{% endraw %} {% raw %}
haplotypes['668']
{% endraw %} {% raw %}
maker._MarkerMaker__ClusterByLD(data, haplotypes, varnames)
{% endraw %} {% raw %}
maker._MarkerMaker__CodeHaplotypes(data, haplotypes, mafs, varnames, [])
{% endraw %} {% raw %}
# Run the haplotype coder without any LD clusters.
clusters = []
if clusters is not None:
    # For every family, translate each cluster's variant names into indices
    # within that family's variant list.
    clusters_idx = [[[varnames[item].index(x) for x in y] for y in clusters] for item in haplotypes]
else:
    clusters_idx = [[[]] for item in haplotypes]
# `mafs` is stored per family and then per variant in this section, so the
# frequency of variant v in family `item` is mafs[item][v] (mafs[v] raises KeyError).
maker.coder.Execute(haplotypes.values(), [[mafs[item][v] for v in varnames[item]] for item in haplotypes], clusters_idx)
{% endraw %} {% raw %}
maker.coder.Print()
{% endraw %} {% raw %}
maker.ld
{% endraw %} {% raw %}
[[mafs[item][v] for v in varnames[item]] for item in haplotypes]
{% endraw %} {% raw %}
varnames['668']
{% endraw %} {% raw %}
mafs
{% endraw %}

Test `__ClusterByLD`

{% raw %}
data.freq
{% endraw %} {% raw %}
data.variants
{% endraw %} {% raw %}
maker.apply(data)
{% endraw %} {% raw %}
# Fresh, empty containers that are handed to __Haplotype.
haplotypes = OrderedDict()
mafs = {}
varnames = {}
recombPos = {}
uniq_vars = []
exclude_vars = []
# __Haplotype takes seven arguments (data, haplotypes, mafs, varnames,
# recombPos, uniq_vars, exclude_vars); calling it with only the first four
# raises TypeError.
maker._MarkerMaker__Haplotype(data, haplotypes, mafs, varnames, recombPos, uniq_vars, exclude_vars)
{% endraw %} {% raw %}
type(data)
{% endraw %} {% raw %}
markers = ["V{}-{}".format(idx, item[1]) for idx, item in enumerate(data.variants)]
{% endraw %} {% raw %}
item = list(data.families.keys())[2]
{% endraw %} {% raw %}
item ='1036'
{% endraw %} {% raw %}
varnames = {}
{% endraw %} {% raw %}
varnames[item], positions, vcf_mafs = data.getFamVariants(item, style = "map")
{% endraw %} {% raw %}
varnames
{% endraw %} {% raw %}
item
{% endraw %} {% raw %}
varnames
{% endraw %} {% raw %}
tmp
{% endraw %} {% raw %}
maker.haplotyper.Execute(data.chrom, varnames[item], sorted(positions), data.getFamSamples(item))[0]
{% endraw %} {% raw %}
recombPos={}
{% endraw %} {% raw %}
varnames[item], positions, vcf_mafs = data.getFamVariants(item, style = "map")
recombPos[item]={}
var_for_haplotype=[]
positions_for_haplotype=[]
output_sample=[]
{% endraw %} {% raw %}
data.gnomAD_estimate.keys()
{% endraw %} {% raw %}
data.freq
{% endraw %} {% raw %}
positions
{% endraw %} {% raw %}
var_for_haplotype=varnames[item]
positions_for_haplotype=positions
{% endraw %} {% raw %}
item
{% endraw %} {% raw %}
famid =item
{% endraw %} {% raw %}
# Seed for the pedigree sort: the founders of family `famid`.
# This must be a real list. In Python 3 `filter(...)` returns a one-shot
# iterator that is always truthy and has no pop()/append(), so it cannot be
# used as a work list (and is emptied by a single list(...) call).
sorted_names = []
S_no_parents = [x for x in data.tfam.families[famid] if data.tfam.is_founder(x)]
# Work on a copy so that popping entries does not modify data.tfam.graph.
graph = data.tfam.graph[famid].copy()
{% endraw %} {% raw %}
list(S_no_parents)
{% endraw %} {% raw %}
data.tfam.families[famid]
{% endraw %} {% raw %}
graph
{% endraw %} {% raw %}
# Topological sort of one family: take a member from the work list, emit it,
# remove its entry from `graph`, and queue each offspring once neither of its
# parents is still present in `graph`.
while S_no_parents:
    n = S_no_parents.pop()
    sorted_names.append(n)
    if n not in graph:
        # no offspring entry recorded for this member
        continue
    offsprings = graph.pop(n)
    for m in offsprings:
        father, mother = data.tfam.get_parents(m)
        if father not in graph and mother not in graph:
            S_no_parents.append(m)
# Anything left in `graph` could never be emitted, i.e. the pedigree has a cycle.
if graph:
    raise ValueError("There is a loop in the pedigree: {}\n".format(' '.join(graph.keys())))
# This code was lifted from a method body; `return` is a SyntaxError at cell
# level, so show the result as the value of the cell instead.
sorted_names
{% endraw %} {% raw %}
data.tfam
{% endraw %} {% raw %}
# Build one row per member of family `item`, in the order given by
# data.tfam.sort_family: the member's pedigree fields followed by one genotype
# per variant in var_for_haplotype.
for person in data.tfam.sort_family(item):
    output_sample.append([])
    last_ele=len(output_sample)-1
    # pedigree record of this person without its last field
    output_sample[last_ele] = data.tfam.samples[person][:-1]
    if person in data.samples:
        for marker in var_for_haplotype:
            # marker names have the form "V<idx>-<position>"; recover <idx>
            idx=int(marker.split('-')[0][1:])
            output_sample[last_ele].append(data.genotype_all[person][idx])
    else:
        # person is not in data.samples: fill every variant with "00"
        output_sample[last_ele].extend(["00"] * len(var_for_haplotype))
{% endraw %} {% raw %}
len(data.tfam.sort_family(item))
{% endraw %} {% raw %}
set(output_sample[0][5:])
{% endraw %} {% raw %}
env.tmp_log
{% endraw %} {% raw %}
haplotypes = {}
{% endraw %} {% raw %}
tmp_log_output=env.tmp_log + str(os.getpid())
haplotypes[item] = maker.haplotyper.Execute(data.chrom, var_for_haplotype, positions_for_haplotype, output_sample, maker.rsq, tmp_log_output)[0]
{% endraw %} {% raw %}
1
{% endraw %} {% raw %}
var_for_haplotype
{% endraw %} {% raw %}
positions_for_haplotype
{% endraw %} {% raw %}
haplotypes
{% endraw %} {% raw %}
1+1
{% endraw %} {% raw %}
haplotypes
{% endraw %} {% raw %}
str(os.getpid())
{% endraw %} {% raw %}
var_for_haplotype
{% endraw %} {% raw %}
positions_for_haplotype
{% endraw %} {% raw %}
output_sample
{% endraw %} {% raw %}
maker.rsq
{% endraw %} {% raw %}
haplotypes['1']
{% endraw %} {% raw %}
# For family `item`, record which haplotypes (by index) show a break between
# two adjacent variants. recombPos[item] maps
# (previous variant name, variant name) -> list of haplotype indices.
for hap_idx, haploid in enumerate(haplotypes[item]):
    # the first two entries of a haplotype row are skipped; the per-variant
    # calls start at index 2
    for vidx, var in enumerate(haploid[2:]):
        # calls ending in ':' or '|' are skipped, as is the first variant,
        # which has no predecessor to pair with
        if not var.endswith((':', '|')) and vidx != 0:
            postvar_name = varnames[item][vidx]
            prevar_name = varnames[item][vidx - 1]
            recomb_pair = (prevar_name, postvar_name)
            print('run this')
            # setdefault replaces the former bare `except:`, which hid every
            # kind of error just to create the list on first use
            recombPos[item].setdefault(recomb_pair, []).append(hap_idx)
{% endraw %} {% raw %}
haploid
{% endraw %} {% raw %}
var
{% endraw %} {% raw %}
haplotypes['1']
{% endraw %} {% raw %}
mafs
{% endraw %} {% raw %}
varnames
{% endraw %} {% raw %}
recombPos
{% endraw %} {% raw %}
uniq_vars
{% endraw %} {% raw %}
exclude_vars
{% endraw %} {% raw %}
maker.rsq
{% endraw %} {% raw %}
person
{% endraw %} {% raw %}
mafs
{% endraw %} {% raw %}
# Per-variant allele frequencies for family `item`.
mafs = {}
# either use the provided MAF or compute the MAF
if all(vcf_mafs):
    print('run this')
    # every variant has a frequency from the VCF: take it, first value wins
    for idx, v in enumerate(varnames[item]):
        if v not in mafs:
            mafs[v] = vcf_mafs[idx]
else:
    # count founder alleles
    for hap in haplotypes[item]:
        if not data.tfam.is_founder(hap[1]):
            continue
        for idxv, v in enumerate(varnames[item]):
            if v not in mafs:
                # [#alt, #haplotypes]
                mafs[v] = [0, 0]
            # the allele is the second character when the call starts with an
            # upper-case letter, otherwise the first character
            gt = hap[2 + idxv][1] if hap[2 + idxv][0].isupper() else hap[2 + idxv][0]
            if not gt == "?":
                # there is no `self` in the notebook; the conversion table
                # lives on the MarkerMaker instance `maker`
                mafs[v][0] += maker.gtconv[gt]
                mafs[v][1] += 1.0
{% endraw %} {% raw %}
mafs
{% endraw %} {% raw %}
vcf_mafs
{% endraw %} {% raw %}
type(mafs['V0-176659933'])
{% endraw %} {% raw %}
maker.maf_cutoff
{% endraw %} {% raw %}
# Variants whose frequency exceeds the cutoff are removed from every family.
exclude_vars = [v for v in mafs if mafs[v] > maker.maf_cutoff]
# set copy for constant-time membership tests; exclude_vars itself stays a list
excluded = set(exclude_vars)
# Iterate over a snapshot of the keys: families are deleted inside the loop,
# and deleting from a dict while iterating over it raises RuntimeError.
for i in list(haplotypes.keys()):
    haplotypes[i] = listit(haplotypes[i])
    for j in range(len(haplotypes[i])):
        # keep the two leading fields, then only the calls of retained variants
        haplotypes[i][j] = haplotypes[i][j][:2] + \
          [x for idx, x in enumerate(haplotypes[i][j][2:]) if varnames[i][idx] not in excluded]
    varnames[i] = [x for x in varnames[i] if x not in excluded]
    # handle trivial data
    if len(varnames[i]) == 0:
        for person in data.families[i]:
            # there is no `self` in the notebook; use the MarkerMaker instance
            data[person] = maker.missings
        del varnames[i]
        del haplotypes[i]
{% endraw %} {% raw %}
tmp_exclude_vars=exclude_vars
{% endraw %} {% raw %}
tmp_exclude_vars
{% endraw %} {% raw %}
recombPos
{% endraw %} {% raw %}
# Family '1': collect its variant names without duplicates (first occurrence
# order), then drop the previously excluded variants from the family's list.
i = '1'
uniq_vars = []
for tmp_var in varnames[i]:
    if tmp_var in uniq_vars:
        continue
    uniq_vars.append(tmp_var)
varnames[i] = [name for name in varnames[i] if name not in tmp_exclude_vars]
{% endraw %} {% raw %}
data
{% endraw %} {% raw %}
data.genotype_all
{% endraw %} {% raw %}
varnames
{% endraw %} {% raw %}
# Nothing to do when no family has variants left.
if varnames:
    if any(len(varnames[x]) != 1 for x in varnames):
        print('run this')
        # calculate LD clusters using founder haplotypes
        clusters = maker._MarkerMaker__ClusterByLD(data, haplotypes, varnames)
        # recode the genotypes of the region
        maker._MarkerMaker__CodeHaplotypes(data, haplotypes, mafs, varnames, clusters)
    else:
        # every family has exactly one variant
        maker._MarkerMaker__AssignSNVHaplotypes(data, haplotypes, mafs, varnames)
{% endraw %} {% raw %}
clusters
{% endraw %}

def __ClusterByLD(self, data, haplotypes, varnames):

{% raw %}
haplotypes
OrderedDict([('1',
              [['1', 'I:1', '1:', '1:', '1:', '1:', '1:', '1:', '1:'],
               ['1', 'I:1', '2:', '2:', '1:', '1:', '1:', '1:', '2:'],
               ['1', 'I:2', '2:', '2:', '1:', '1:', '1:', '2:', '2:'],
               ['1', 'I:2', '1:', '1:', '1:', '1:', '1:', '1:', '1:'],
               ['1', 'II:3', '2:', '2|', '1:', '1:', '1:', '2|', '2|'],
               ['1', 'II:3', '1:', '1|', '1:', '1:', '1:', '1|', '1|'],
               ['1', 'II:2', '2:', '2|', '1:', '1:', '1:', '2|', '2|'],
               ['1', 'II:2', '1:', '1|', '1:', '1:', '1:', '1|', '1|'],
               ['1', 'II:1', '2:', '2|', '1:', '1:', '1:', '2|', '2|'],
               ['1', 'II:1', '2:', '2|', '1:', '1:', '1:', '1|', '2|'],
               ['1', 'II:4', '2:', '2|', '1:', '1:', '1:', '2\\', '2|'],
               ['1', 'II:4', '1:', '1|', '1:', '1:', '1:', '1\\', '2|']]),
             ('2',
              [['2', 'I:A', '1:', '1:', '1:', '1:', '1:', '1:', '2:'],
               ['2', 'I:A', '1:', '1:', '1:', '1:', '1:', '1:', '2:'],
               ['2', 'I:B', '2:', '2:', '2:', '2:', '2:', '2:', '1:'],
               ['2', 'I:B', '1:', '1:', '1:', '1:', '1:', '1:', '1:'],
               ['2', 'II:D', '2:', '2|', '2|', '2|', '2|', '2|', '1:'],
               ['2', 'II:D', '1:', '1|', '1|', '1|', '1|', '1|', '2:'],
               ['2', 'II:C', '1:', '1|', '1|', '1|', '1|', '1|', '1:'],
               ['2', 'II:C', '1:', '1|', '1|', '1|', '1|', '1|', '2:'],
               ['2', 'II:B', '1:', '1|', '1|', '1|', '1|', '1|', '1:'],
               ['2', 'II:B', '1:', '1|', '1|', '1|', '1|', '1|', '2:'],
               ['2', 'II:A', '2:', '2|', '2|', '2|', '2|', '2|', '1:'],
               ['2', 'II:A', '1:', '1|', '1|', '1|', '1|', '1|', '2:']])])
{% endraw %} {% raw %}
maker.r2
{% endraw %} {% raw %}
markers
{% endraw %} {% raw %}
# Collect the founder haplotypes of all families over the union of their
# variants, once as (label, string) pairs and once as [label, list] pairs.
founder_haplotypes = []
gtt = []
# union of all variant names, ordered by the numeric index in "V<idx>-<pos>"
markers = sorted(set(itertools.chain.from_iterable(varnames.values())),
                 key = lambda x: int(x.split("-")[0][1:]))
for item in haplotypes:
    for ihap, hap in enumerate(haplotypes[item]):
        if data.tfam.is_founder(hap[1]):
            # '?' stands in for variants this family does not carry
            gt = [hap[2 + varnames[item].index(v)] if v in varnames[item] else '?' for v in markers]
            # the allele is the second character when the call starts with an
            # upper-case letter, otherwise the first character
            alleles = [x[1] if x[0].isupper() else x[0] for x in gt]
            label = "{}-{}".format(hap[1], ihap % 2)
            founder_haplotypes.append((label, "".join(alleles)))
            gtt.append([label, alleles])
{% endraw %} {% raw %}
founder_haplotypes
[('I:1-0', '1111111'),
 ('I:1-1', '2211112'),
 ('I:2-0', '2211122'),
 ('I:2-1', '1111111'),
 ('I:A-0', '1111112'),
 ('I:A-1', '1111112'),
 ('I:B-0', '2222221'),
 ('I:B-1', '1111111')]
{% endraw %} {% raw %}
egglib.stats.matrix_LD(Align.create(founder_haplotypes,egglib.Alphabet(cat='string',expl=['1','2'],miss='?')),('rsq'))
([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
 [[None],
  [1.0, None],
  [0.2380952380952381, 0.2380952380952381, None],
  [0.2380952380952381, 0.2380952380952381, 1.0, None],
  [0.2380952380952381, 0.2380952380952381, 1.0, 1.0, None],
  [0.5555555555555556,
   0.5555555555555556,
   0.42857142857142866,
   0.42857142857142866,
   0.42857142857142866,
   None],
  [0.06666666666666665,
   0.06666666666666665,
   0.14285714285714282,
   0.14285714285714282,
   0.14285714285714282,
   0.0,
   None]])
{% endraw %} {% raw %}
gtt
[['I:1-0', ['1', '1', '1', '1', '1', '1', '1']],
 ['I:1-1', ['2', '2', '1', '1', '1', '1', '2']],
 ['I:2-0', ['2', '2', '1', '1', '1', '2', '2']],
 ['I:2-1', ['1', '1', '1', '1', '1', '1', '1']],
 ['I:A-0', ['1', '1', '1', '1', '1', '1', '2']],
 ['I:A-1', ['1', '1', '1', '1', '1', '1', '2']],
 ['I:B-0', ['2', '2', '2', '2', '2', '2', '1']],
 ['I:B-1', ['1', '1', '1', '1', '1', '1', '1']]]
{% endraw %} {% raw %}
import egglib
{% endraw %} {% raw %}
gt
['1:', '1:', '1:', '1:', '1:', '1:', '1:']
{% endraw %} {% raw %}
# Pairwise r^2 between markers, computed from the founder haplotypes in `gtt`.
# As the output shown earlier in this notebook illustrates, matrix_LD returns a
# list of site positions and a lower-triangular matrix (row k holds the values
# for columns 0..k, with None on the diagonal).
ldi,ld = egglib.stats.matrix_LD(Align.create(gtt,egglib.Alphabet(cat='char',expl=['1','2'],miss='?')),('rsq'))
blocks = []
for j in range(len(ldi)):
    # start a block at marker j and add every later marker k whose r^2 with j
    # exceeds the threshold maker.r2
    block = [j]
    for k in range(j+1,len(ldi)):
        # k > j, so ld[k][j] addresses the lower triangle
        if ld[k][j] > maker.r2:
            block.append(k)
    # singleton blocks are not kept
    if len(block) > 1:
        blocks.append(block)
# merge blocks through connected_components and map marker indices back to names
clusters = [[markers[idx] for idx in item] for item in list(connected_components(blocks))]
{% endraw %} {% raw %}
clusters
[['V0-89984370', 'V1-89984604'], ['V2-89984739', 'V3-89985940', 'V4-89986608']]
{% endraw %} {% raw %}
varnames
{'1': ['V0-89984370',
  'V1-89984604',
  'V2-89984739',
  'V3-89985940',
  'V4-89986608',
  'V5-89986760',
  'V6-89987201'],
 '2': ['V0-89984370',
  'V1-89984604',
  'V2-89984739',
  'V3-89985940',
  'V4-89986608',
  'V5-89986760',
  'V6-89987201']}
{% endraw %} {% raw %}
# NOTE(review): this looks like the same clustering written against a
# different Align interface (`matrixLD(...)["r2"]`) than the
# egglib.stats.matrix_LD call used elsewhere in this notebook -- confirm which
# one the installed egglib provides before running.
ld = Align.create(founder_haplotypes).matrixLD(validCharacters="12")["r2"]
blocks = []
# here `ld` is iterated like a mapping of mappings: ld[j][k] is the r^2 of j and k
for j in ld:
    block = [j]
    for k in ld[j]:
        if ld[j][k] > maker.r2:
            block.append(k)
    # singleton blocks are not kept
    if len(block) > 1:
        blocks.append(block)
# get LD clusters
clusters = [[markers[idx] for idx in item] for item in list(connected_components(blocks))]
{% endraw %} {% raw %}
ld
{% endraw %} {% raw %}
block
{% endraw %} {% raw %}
list(connected_components([]))
{% endraw %} {% raw %}
list(connected_components(blocks))
{% endraw %} {% raw %}
clusters
{% endraw %}

def __CodeHaplotypes(self, data, haplotypes, mafs, varnames, clusters):

{% raw %}
# Per family, express every LD cluster as positions within that family's
# variant list; without clusters each family gets one empty cluster.
if clusters is None:
    clusters_idx = [[[]] for _ in haplotypes]
else:
    clusters_idx = [
        [[varnames[fam].index(name) for name in cluster] for cluster in clusters]
        for fam in haplotypes
    ]
{% endraw %} {% raw %}
clusters_idx
{% endraw %} {% raw %}
maker.coder.Execute(haplotypes.values(), [[mafs[v] for v in varnames[item]] for item in haplotypes], clusters_idx)
{% endraw %} {% raw %}
data.superMarkerCount
{% endraw %} {% raw %}
# Store the coded haplotypes on `data` and track the largest number of
# super-markers seen for any sample.
for line in maker.coder.GetHaplotypes():
    print(line)
    if line[1] in data:
        data[line[1]] = (line[2].split(','), line[3].split(','))
        data.superMarkerCount = max(data.superMarkerCount, len(data[line[1]][0]))
    # else: this sample is not in the VCF file. Every variant site should be
    # missing, so it has to be skipped for now
        
{% endraw %} {% raw %}
data
{% endraw %} {% raw %}
data.superMarkerCount
{% endraw %} {% raw %}
# Fetch the coder's allele frequencies per family and rescale each marker's
# entries to sum to one; entries whose sum is zero are kept unchanged.
for item in haplotypes:
    data.maf[item] = maker.coder.GetAlleleFrequencies(item)
    data.maf[item] = tuple(
        tuple(np.array(freqs) / total) if total else freqs
        for freqs, total in ((freqs, np.sum(freqs)) for freqs in data.maf[item])
    )
{% endraw %} {% raw %}
data.maf
{% endraw %} {% raw %}
maker._MarkerMaker__FormatHaplotypes(data,recombPos,varnames,uniq_vars)
{% endraw %} {% raw %}
data
{% endraw %} {% raw %}
type(data['I:1'][0])
{% endraw %} {% raw %}
    for item in data.famvaridx:
        if item not in haplotypes and data[data.families[item][0]] != ('0','0'):
            # when only wild-type haplotypes are present in a family, still code them instead of ignoring the family
            if self.freq_by_fam:
                pop=data.freq_by_fam[item]
                try:
                    varnames[item]=data.total_varnames[pop]
                    mafs[item]=data.total_mafs[pop]
                except:
                    continue
            else:
                varnames[item]=data.total_varnames['pop']
                mafs[item]=data.total_mafs
            haplotypes[item]=[]
            for person in data.families[item]:
                tmp_person=[item, person]
                if '00' in data[person]:
                    tmp_person+=['?:']*len(varnames[item])
                else:
                    tmp_person+=['1:']*len(varnames[item])
                haplotypes[item].append(tmp_person)
                haplotypes[item].append(tmp_person)
        elif item in haplotypes:
            nonvar_hap_flag=False
            #determine if wild-type haplotype is present in a family
            for hap in haplotypes[item]:
                tmp_genes=[]
                for tmpa in hap[2:]:
                    if 'A' in tmpa or 'B' in tmpa:
                        tmp_genes.append(tmpa[1])
                    else:
                        tmp_genes.append(tmpa[0])
                if set(tmp_genes)==set(['1']):
                    #non variant haplotype
                    nonvar_hap_flag=True
                    break
            if not nonvar_hap_flag:
                #if family don't have wild-type haplotype, add a fake one to ensure correct coding
                var_num=len(varnames[item])
                fake_person=[item, 'FAKEPERSON']+['1:']*var_num
                haplotypes[item].append(fake_person)
            for hidx,hap in enumerate(haplotypes[item]):
                if hap[1] in data.missing_persons:
                    missing_person=[item,hap[1]]+['?:']*len(varnames[item])
                    haplotypes[item][hidx]=missing_person

    if not clusters is None:
        clusters_idx = [[[varnames[item].index(x) for x in y] for y in clusters] for item in haplotypes]
    else:
        clusters_idx = [[[]] for item in haplotypes]
    if env.debug:
        for item in haplotypes:
            with env.lock:
                print(varnames[item],file=sys.stderr)
                print("hap{0}\t{1}\n".format(item,haplotypes[item]),file=sys.stderr)
    self.coder.Execute(haplotypes.values(), [[mafs[item][v] for v in varnames[item]] for item in haplotypes], clusters_idx)
    if env.debug:
        with env.lock:
            if clusters:
                print("Family LD clusters: ", clusters_idx, "\n", file = sys.stderr)
            self.coder.Print()
    # line: [fid, sid, hap1, hap2]
    for line in self.coder.GetHaplotypes():
        if not line[1] in data:
            # this sample is not in VCF file. Every variant site should be missing
            # they have to be skipped for now
            continue
        data[line[1]] = (line[2].split(','), line[4].split(','))
        #sub-region count for each sample individual
        superMarkerCount=len(data[line[1]][0])
        if line[0] not in data.patterns:
            data.patterns[line[0]]=[[] for x in range(superMarkerCount)]
        for t_Marker in range(superMarkerCount):
            t_pat1=line[3].split(',')[t_Marker]
            t_pat2=line[5].split(',')[t_Marker]
            if t_pat1 not in data.patterns[line[0]][t_Marker]:
                data.patterns[line[0]][t_Marker].append(t_pat1)
            if t_pat2 not in data.patterns[line[0]][t_Marker]:
                data.patterns[line[0]][t_Marker].append(t_pat2)
        if len(data[line[1]][0]) > data.superMarkerCount:
            data.superMarkerCount = len(data[line[1]][0])
    # get MAF
    for item in data.famvaridx:
        if item not in haplotypes:
            for person in data.families[item]:
                data[person]=(['0']*data.superMarkerCount,['0']*data.superMarkerCount)
    for item in haplotypes:
        data.maf[item] = self.coder.GetAlleleFrequencies(item)
        if not len(data.maf[item][0]):
            continue
        data.varnames_by_fam[item]=varnames[item]
        wt_maf=0
        if self.freq_by_fam:
            try:
                wt_maf=data.wt_maf[data.freq_by_fam[item]]
            except:
                pass
        else:
            wt_maf=data.wt_maf['pop']
        tmp_data_maf=[]
        for v in data.maf[item]:
            if len(v)==1:
                tmp_data_maf.append((v[0],1-v[0]))
            else:
                if np.sum(v)<1:
                    tmp_ratio=sum(v[1:])/(1-wt_maf)
                    tmp_list=[wt_maf]
                    if tmp_ratio==0:
                        tmp_list.append(1-wt_maf)
                    else:
                        for tmpv in v[1:]:
                            tmp_list.append(tmpv/tmp_ratio)
                    tmp_data_maf.append(tuple(tmp_list))
                else:
                    tmp_data_maf.append(v)
        data.maf[item]=tuple(tmp_data_maf)
    if env.debug:
        with env.lock:
            print("marker freqs = ", data.maf, "\n", file = sys.stderr)
{% endraw %} {% raw %}
# NOTE(review): this cell is a method body pasted as-is -- it still refers to
# `self` (self.missings, self.haplotyper, self.__PedToHaplotype, self.gtconv),
# which is not defined at notebook level. Confirm whether it is meant to run here.
#
# For every family: fetch its variants, haplotype the family, then fill `mafs`
# either from the VCF frequencies or by counting founder alleles.
for item in data.families:
    varnames[item], positions, vcf_mafs = data.getFamVariants(item, style = "map")
    if len(varnames[item]) == 0:
        # no variants in this family: mark all members as missing and move on
        for person in data.families[item]:
            data[person] = self.missings
        continue
    if env.debug:
        with env.lock:
            sys.stderr.write('\n'.join(['\t'.join(x) for x in data.getFamSamples(item)]) + '\n\n')
    # haplotyping
    with env.lock:
        if not env.prephased:
            # run the haplotyper with stdout redirected to a per-process log file
            with stdoutRedirect(to = env.tmp_log + str(os.getpid()) + '.log'):
                haplotypes[item] = self.haplotyper.Execute(data.chrom, varnames[item],
                                                       sorted(positions), data.getFamSamples(item))[0]
        else:
            haplotypes[item] = self.__PedToHaplotype(data.getFamSamples(item))
    if len(haplotypes[item]) == 0:
        # C++ haplotyping implementation failed
        with env.chperror_counter.get_lock():
            env.chperror_counter.value += 1
    # either use the provided MAF or compute the MAF
    if all(vcf_mafs):
        for idx, v in enumerate(varnames[item]):
            if v not in mafs:
                mafs[v] = vcf_mafs[idx]
    else:
        # count founder alleles
        for hap in haplotypes[item]:
            if not data.tfam.is_founder(hap[1]):
                continue
            for idxv, v in enumerate(varnames[item]):
                if v not in mafs:
                    # [#alt, #haplotypes]
                    mafs[v] = [0, 0]
                # allele is the second character when the call starts with an
                # upper-case letter, otherwise the first character
                gt = hap[2 + idxv][1] if hap[2 + idxv][0].isupper() else hap[2 + idxv][0]
                if not gt == "?":
                    mafs[v][0] += self.gtconv[gt]
                    mafs[v][1] += 1.0
{% endraw %} {% raw %}
maker.haplotyper
{% endraw %} {% raw %}
tmp1
{% endraw %} {% raw %}
# Pull ten items from `queue`, run each through tmp / tmp1 on the shared `dd`
# object, and remember the items for which `dd.variants` ends up non-empty.
# NOTE(review): `tmp`, `tmp1`, `dd` and `queue` are defined outside this view;
# the items are presumably regions (they are passed to getRegion) -- confirm.
aa = []
for _ in range(10):
    a = queue.get()
    print(a)
    tmp.getRegion(a)
    tmp.apply(dd)
    tmp1.apply(dd)
    #tmp2.apply(dd)
    if len(dd.variants) != 0:
        aa.append(a)
{% endraw %} {% raw %}
data1 = deepcopy(data)
{% endraw %}

_MarkerMaker__Haplotype

{% raw %}
data = dd
{% endraw %} {% raw %}
haplotypes = OrderedDict()
mafs = {}   ##Per fam per variant
uniq_vars = []
exclude_vars = []
varnames = {}
recombPos = {}
{% endraw %} {% raw %}
tmp1.markers
{% endraw %}

`def __Haplotype(self, data, haplotypes, mafs, varnames, recombPos, uniq_vars, exclude_vars):`

> genetic haplotyping. `haplotypes` stores per-family data.

FIXME: it is SWIG's (2.0.12) fault not to properly destroy the object "Pedigree" in "Execute()", so there is a memory leak here which I tried to partially handle on the C++ side.

Per-family haplotyping

{% raw %}
tmp1.markers = ["V{}-{}".format(idx, item[1]) for idx, item in enumerate(data.variants)]
{% endraw %} {% raw %}
tmp_mafs = {}
if tmp1.freq_by_fam:
    ## if families are from different populations
    ## estimate MAF by different population
    # group the family ids by the population recorded in data.freq_by_fam
    fam_to_analyze = {}
    # dict.iteritems() was removed in Python 3; items() is the replacement
    for fam, pop in data.freq_by_fam.items():
        fam_to_analyze.setdefault(pop, []).append(fam)
{% endraw %} {% raw %}
if tmp1.count:
    ## estimate MAF by counting founder alleles
    if not tmp1.freq_by_fam:
        # all families together
        local_count_mafs = tmp1._MarkerMaker__computefounderfreq(data, data.families.keys())
        print('run here')
    else:
        # one estimate per population, from that population's families
        local_count_mafs = {
            pop: tmp1._MarkerMaker__computefounderfreq(data, fams)
            for pop, fams in fam_to_analyze.items()
        }
{% endraw %} {% raw %}
local_count_mafs
{% endraw %} {% raw %}
tmp1.mle = True
{% endraw %} {% raw %}
# Maximum-likelihood allele frequencies. Results are cached in data.mle_mafs
# (keyed by marker name, or by population and then marker name) and reused for
# variants at a position that was already analysed.
if tmp1.mle:
    ## estimate MLE allele frequency using all fam
    local_mle_mafs={}
    if tmp1.freq_by_fam:
        for pop in fam_to_analyze:
            local_mle_mafs[pop]={}
            markers_to_analyze=[]
            pos_all=[]
            # position (as string) -> cached frequency for this population
            markers_analyzed={}
            if pop not in data.mle_mafs:
                data.mle_mafs[pop]={}
            else:
                for tmpv in data.mle_mafs[pop]:
                    markers_analyzed[tmpv.split('-')[-1]]=data.mle_mafs[pop][tmpv]
            output_log=env.tmp_log+"AF_{}_{}.log".format(pop,tmp1.name)
            popidx=tmp1.af_info.index(pop)
            # distinct variants carried by the families of this population
            variants_in_fams=[]
            for item in fam_to_analyze[pop]:
                for tmpvar in data.getFamVariants(item):
                    if tmpvar not in variants_in_fams:
                        variants_in_fams.append(tmpvar)
            variants_in_fams=sorted(variants_in_fams, key=lambda x: x[1])
            for item in variants_in_fams:
                idx=data.variants.index(item)
                # only variants whose last field is 0 for this population are estimated
                if item[-1][popidx]==0:
                    if str(item[1]) in markers_analyzed:
                        #if variant has been analyzed
                        vname="V{}-{}".format(idx,item[1])
                        local_mle_mafs[pop][vname]=markers_analyzed[str(item[1])]
                    else:
                        #variant not analyzed before
                        markers_to_analyze.append("V{}-{}".format(idx,item[1]))
                        pos_all.append(item[1])
            tmp_mle_mafs=tmp1._MarkerMaker__getMLEfreq(data, markers_to_analyze, pos_all, fam_to_analyze[pop], tmp1.rsq, output_log)
            if len(tmp_mle_mafs) > 0:
                # dict.iteritems() was removed in Python 3; items() is the replacement
                for vname,vmaf in tmp_mle_mafs.items():
                    data.mle_mafs[pop][vname]=vmaf
                    local_mle_mafs[pop][vname]=vmaf
    else:
        #Homogeneous families
        markers_to_analyze=[]
        pos_all=[]
        # position (as string) -> cached frequency
        markers_analyzed={}
        for tmpv in data.mle_mafs:
            markers_analyzed[tmpv.split('-')[-1]]=data.mle_mafs[tmpv]
        # distinct variants over all families (tuples so they can go in a set)
        variants_in_fams=[]
        for item in data.families.keys():
            var_per_fam=[tuple(tmpvar) for tmpvar in data.getFamVariants(item)]
            variants_in_fams=list(set(var_per_fam+variants_in_fams))
        variants_in_fams=[list(tmpvar) for tmpvar in sorted(variants_in_fams, key=lambda x: x[1])]
        for item in variants_in_fams:
            idx=data.variants.index(item)
            if item[-1]==0 or tmp1.af_info is None:
                if str(item[1]) in markers_analyzed:
                    #if variant has been analyzed
                    vname="V{}-{}".format(idx,item[1])
                    local_mle_mafs[vname]=markers_analyzed[str(item[1])]
                else:
                    #variant not analyzed before
                    markers_to_analyze.append("V{}-{}".format(idx,item[1]))
                    pos_all.append(item[1])
        output_log=env.tmp_log+"AF_{}.log".format(tmp1.name)
        tmp_mle_mafs=tmp1._MarkerMaker__getMLEfreq(data, markers_to_analyze, pos_all, data.families.keys(), tmp1.rsq, output_log)
        if len(tmp_mle_mafs) > 0:
            # dict.iteritems() was removed in Python 3; items() is the replacement
            for vname, vmaf in tmp_mle_mafs.items():
                data.mle_mafs[vname]=vmaf
                local_mle_mafs[vname]=vmaf
{% endraw %} {% raw %}
tmp_mle_mafs
{% endraw %} {% raw %}
varnames
{% endraw %} {% raw %}
data.families
{% endraw %} {% raw %}
# dict.keys() is a view in Python 3 and cannot be indexed; materialise it first
list(data.families.keys())[0]
{% endraw %} {% raw %}
# dict.keys() is a view in Python 3 and cannot be indexed; materialise it first
data.getFamVariants(list(data.families.keys())[1],style="map")
{% endraw %} {% raw %}
data.famvaridx
{% endraw %} {% raw %}
data.famsampidx
{% endraw %} {% raw %}
# Per-family haplotyping pass, run at notebook top level with `tmp1` playing
# the role of the MarkerMaker instance.
#
# Reads : data, env, tmp1, tmp_mafs, local_mle_mafs, local_count_mafs
# Writes: varnames, recombPos, mafs, tmp_mafs, haplotypes, data.missing_persons
#
# For every family it (1) picks a MAF for each variant, (2) optionally keeps
# only variants with MAF <= tmp1.maf_cutoff (tmp1.rvhaplo), (3) assembles the
# sample/genotype rows and (4) haplotypes them.
gnomAD_pop=None
for item in data.families:
    varnames[item], positions, vcf_mafs = data.getFamVariants(item, style = "map")
    recombPos[item]={}
    var_for_haplotype=[]
    positions_for_haplotype=[]
    output_sample=[]
    if env.debug:
        with env.lock:
            sys.stderr.write('\n'+repr(varnames[item])+'\n')
            sys.stderr.write('\n'.join(['\t'.join(x) for x in data.getFamSamples(item)]) + '\n\n')
    # either use provided MAF or compute MAF
    if tmp1.freq_by_fam:
        mafs[item]={}
        tfreq_fam=data.freq_by_fam[item]
        # NOTE(review): gnomAD_pop is not reset per family, so a family whose
        # population matches no gnomAD key inherits the previous family's
        # match -- confirm this carry-over is intended.
        for pop in data.gnomAD_estimate.keys():
            if pop in tfreq_fam:
                gnomAD_pop=pop
                break
    elif gnomAD_pop is None and data.freq is not None:
        for pop in data.gnomAD_estimate.keys():
            if pop in data.freq:
                gnomAD_pop=pop
                break
    for idx, v in enumerate(varnames[item]):
        # MAF used for the rvhaplo cutoff below; stays 0 when no branch sets it
        tmp_maf_var=0
        if tmp1.af_info is None:
        #no vcf freq column specified
            if v not in tmp_mafs:
                if tmp1.mle:
                #use MLE freq for all variants
                    tmp_mafs[v]=local_mle_mafs[v]
                elif tmp1.count:
                #estimate MAF based on founder counts if MLE not specified
                    tmp_mafs[v]=local_count_mafs[v]
                # NOTE(review): unlike the VCF-frequency branch below, this
                # assignment sits inside the `v not in tmp_mafs` guard, so a
                # variant already seen in an earlier family keeps
                # tmp_maf_var == 0 -- confirm this is intended.
                tmp_maf_var=tmp_mafs[v]
        else:
            #if vcf freq column is specified
            #use vcf_mafs if possible
            if vcf_mafs[idx]:
                tmp_maf_var=vcf_mafs[idx]
                if tmp1.freq_by_fam:
                    mafs[item][v] = vcf_mafs[idx]
                else:
                    if v not in tmp_mafs:
                        tmp_mafs[v] = vcf_mafs[idx]
            else:
                #if variants do not have valid vcf_mafs values if specified
                #fallback order: gnomAD estimate, then MLE, then founder counts
                if tmp1.freq_by_fam:
                    if gnomAD_pop is not None:
                        mafs[item][v]=data.gnomAD_estimate[gnomAD_pop]
                    elif tmp1.mle:
                        mafs[item][v]=local_mle_mafs[data.freq_by_fam[item]][v]
                    elif tmp1.count:
                        mafs[item][v]=local_count_mafs[data.freq_by_fam[item]][v]
                    tmp_maf_var=mafs[item][v]
                else:
                    if v not in tmp_mafs:
                        if gnomAD_pop is not None:
                            tmp_mafs[v]=data.gnomAD_estimate[gnomAD_pop]
                        elif tmp1.mle:
                            tmp_mafs[v]=local_mle_mafs[v]
                        elif tmp1.count:
                            tmp_mafs[v]=local_count_mafs[v]
                    tmp_maf_var=tmp_mafs[v]
        if tmp1.rvhaplo:
            if tmp_maf_var<=tmp1.maf_cutoff:
                var_for_haplotype.append(v)
                positions_for_haplotype.append(positions[idx])
    if not tmp1.rvhaplo:
        var_for_haplotype=varnames[item]
        positions_for_haplotype=positions
    #collect sample+genotypes
    for person in data.tfam.sort_family(item):
        output_sample.append([])
        last_ele=len(output_sample)-1
        # slicing copies the pedigree columns, so the appends below do not
        # touch data.tfam.samples
        output_sample[last_ele] = data.tfam.samples[person][:-1]
        if person in data.samples:
            for marker in var_for_haplotype:
                # the text before the first '-' (minus its first character)
                # is parsed as the index into data.genotype_all[person]
                idx=int(marker.split('-')[0][1:])
                output_sample[last_ele].append(data.genotype_all[person][idx])
        else:
            # person has no genotype data: pad with the missing code
            output_sample[last_ele].extend(["00"] * len(var_for_haplotype))
    # haplotyping
    if len(var_for_haplotype)==0:
        varnames.pop(item,None)
        #for person in data.families[item]:
        #    data[person] = tmp1.missings
        continue
    for person in output_sample:
        # every genotype column is "00": record the person as missing
        if set(person[5:])==set(['00']):
            data.missing_persons.append(person[1])
    with env.lock:
        if not env.prephased:
            tmp_log_output=env.tmp_log + str(os.getpid())
            with stdoutRedirect(to = tmp_log_output + '.log'):
                haplotypes[item] = tmp1.haplotyper.Execute(data.chrom, var_for_haplotype, positions_for_haplotype, output_sample, tmp1.rsq, tmp_log_output)[0]
        else:
            # Outside the class body `__PedToHaplotype` is not name-mangled,
            # so the private method has to be reached by its mangled name.
            haplotypes[item] = tmp1._MarkerMaker__PedToHaplotype(data.getFamSamples(item))
    if len(haplotypes[item]) == 0:
        # C++ haplotyping implementation failed
        with env.chperror_counter.get_lock():
            env.chperror_counter.value += 1
    varnames[item]=var_for_haplotype
    
# Record, per family, which haplotype rows carry a recombination signal
# between two consecutive variants.  A genotype token (columns from index 2
# onward) whose suffix is neither ':' nor '|' and which is not the first
# variant is booked under the (previous variant, this variant) pair.
for item in haplotypes:
    for hap_idx,haploid in enumerate(haplotypes[item]):
        for vidx,var in enumerate(haploid[2:]):
            if not var.endswith((':', '|')) and vidx!=0:
                postvar_name=varnames[item][vidx]
                prevar_name=varnames[item][vidx-1]
                recomb_pair = (prevar_name,postvar_name)
                # setdefault replaces the former bare `except:` fallback,
                # which would also have swallowed unrelated errors
                recombPos[item].setdefault(recomb_pair, []).append(hap_idx)
#
# Compute founder MAFs
#
# Entries still stored as a two-item list are collapsed to first/second,
# or 0.0 when the second item is not positive.
if len(tmp_mafs) > 0:
    if tmp1.freq_by_fam:
        # nested layout: tmp_mafs[pop][variant]
        for pop in tmp_mafs:
            pop_mafs = tmp_mafs[pop]
            for v in pop_mafs:
                entry = pop_mafs[v]
                if type(entry) is list:
                    if entry[1] >0:
                        pop_mafs[v] = entry[0]/entry[1]
                    else:
                        pop_mafs[v] = 0.0
    else:
        # flat layout: tmp_mafs[variant]
        for v in tmp_mafs:
            entry = tmp_mafs[v]
            if type(entry) is list:
                if entry[1] > 0:
                    tmp_mafs[v] = entry[0]/entry[1]
                else:
                    tmp_mafs[v] = 0.0
## Make mafs consistent in structure regardless of freq_by_fam
if not tmp1.freq_by_fam:
    # every family shares the one flat table (same object, not a copy)
    for item in haplotypes:
        mafs[item]=tmp_mafs
else:
    for item in haplotypes:
        popname=data.freq_by_fam[item]
        if popname in tmp_mafs:
            if item in mafs:
                # keep per-family values already present, fill in the rest
                for v in tmp_mafs[popname]:
                    mafs[item].setdefault(v, tmp_mafs[popname][v])
            else:
                mafs[item]=tmp_mafs[popname]
if env.debug:
    with env.lock:
        print("variant mafs = ", mafs, "\n", file = sys.stderr)
##
#
# Drop some variants if maf is greater than given threshold
#
# Reads : tmp1, data, env, haplotypes, mafs, varnames, recombPos, uniq_vars
# Writes: exclude_vars, haplotypes, varnames, recombPos, uniq_vars,
#         env.commonvar_counter, data.total_varnames, data.total_mafs, data.wt_maf
#
# Python 3 fixes relative to the earlier revision of this cell:
#   * dictionaries that are modified inside a loop are iterated over a
#     snapshot of their keys
#   * dict.iteritems() -> dict.items()
#   * every variant is added to an exclude list at most once, so the
#     "variants ignored" counter is no longer inflated by duplicates
#   * the two "drop this family" branches can no longer delete twice
if tmp1.maf_cutoff is not None or tmp1.single_markers:
    if tmp1.freq_by_fam:
        # one exclude list per population, in data.freq order
        exclude_vars=[[] for x in range(len(data.freq))]
    # snapshot: families may be deleted from `haplotypes` inside the loop
    for i in list(haplotypes.keys()):
        if tmp1.freq_by_fam:
            pop_idx=data.freq.index(data.freq_by_fam[i])
            tmp_exclude_vars=exclude_vars[pop_idx]
        else:
            tmp_exclude_vars=exclude_vars
        for v in mafs[i].keys():
            if v in tmp_exclude_vars:
                continue
            # exclude when the MAF exceeds the cutoff (if one is set) or the
            # variant (text after the last '-') is not in data.include_vars
            too_common = tmp1.maf_cutoff is not None and mafs[i][v] > tmp1.maf_cutoff
            if too_common or v.split('-')[-1] not in data.include_vars:
                tmp_exclude_vars.append(v)
        haplotypes[i] = listit(haplotypes[i])
        tmp_remain_vars=[x for x in varnames[i] if x not in tmp_exclude_vars]
        recomb_remain_vars=[]
        if len(tmp_remain_vars) == 0:
            recombPos[i]={}
        else:
            if len(recombPos[i]) > 0:
                #extend recombination signal to neighbouring RVs
                #if the original variant is to be excluded
                #Only allow a maximum of one recombination event between one pair of consecutive markers
                # snapshot: pairs are deleted from recombPos[i] inside the loop
                for pair in list(recombPos[i].keys()):
                    if pair[1] not in tmp_exclude_vars:
                        if tmp_remain_vars.index(pair[1])!=0 and pair[1] not in recomb_remain_vars:
                            recomb_remain_vars.append(pair[1])
                        else:
                            del recombPos[i][pair]
                    else:
                        if varnames[i].index(pair[1]) > varnames[i].index(tmp_remain_vars[-1]):
                            #last variant
                            del recombPos[i][pair]
                            continue
                        for tmp_idx in range(varnames[i].index(pair[1])+1,len(varnames[i])):
                            if varnames[i][tmp_idx] not in tmp_exclude_vars:
                                if tmp_remain_vars.index(varnames[i][tmp_idx])==0:
                                    #delete recombination pair if the recombination was marked to the first remaining variant
                                    del recombPos[i][pair]
                                    break
                                # move the signal: replace the ':' or '|'
                                # suffix on the next kept variant with '/'
                                for tmp_hap in recombPos[i][pair]:
                                    tmp_var=haplotypes[i][tmp_hap][tmp_idx+2]
                                    if tmp_var.endswith(':') or tmp_var.endswith('|'):
                                        haplotypes[i][tmp_hap][tmp_idx+2]=tmp_var[:-1]+'/'
                                if varnames[i][tmp_idx] not in recomb_remain_vars:
                                    recomb_remain_vars.append(varnames[i][tmp_idx])
                                else:
                                    del recombPos[i][pair]
                                break
        # strip the excluded variants' columns from every haplotype row
        for j in range(len(haplotypes[i])):
            haplotypes[i][j] = haplotypes[i][j][:2] + \
              [x for idx, x in enumerate(haplotypes[i][j][2:]) if varnames[i][idx] not in tmp_exclude_vars]
        for tmp_var in varnames[i]:
            if tmp_var not in uniq_vars:
                uniq_vars.append(tmp_var)
        varnames[i] = [x for x in varnames[i] if x not in tmp_exclude_vars]
        # handle trivial data
        if len(varnames[i]) == 0:
            del varnames[i]
            del haplotypes[i]
        if len(recombPos[i].keys())>tmp1.recomb_max:
            #treat as missing if recombination events occurred more than specified times
            recombPos[i]={}
            for person in data.families[i]:
                # NOTE(review): relies on `data` supporting item assignment
                # and on tmp1.missings existing -- neither is visible here.
                data[person] = tmp1.missings
            # pop() instead of del: the family may already have been removed
            # by the trivial-data branch above
            varnames.pop(i, None)
            haplotypes.pop(i, None)
    # count how many variants are removed
    with env.commonvar_counter.get_lock():
        if tmp1.freq_by_fam:
            tmp_ex_vars=[tmp_var for tmp_vars in exclude_vars for tmp_var in tmp_vars]
            env.commonvar_counter.value += len(set(tmp_ex_vars))
        else:
            env.commonvar_counter.value += len(exclude_vars)
    # get total observed variants
    if tmp1.freq_by_fam:
        for item in varnames:
            pop=data.freq_by_fam[item]
            if pop not in data.total_mafs:
                data.total_mafs[pop]={}
                data.total_varnames[pop]=[]
            for v in varnames[item]:
                if v not in data.total_mafs[pop]:
                    data.total_varnames[pop].append(v)
                    data.total_mafs[pop][v]=mafs[item][v]
        for pop in data.total_varnames:
            data.total_varnames[pop]=sorted(data.total_varnames[pop], key=lambda x: int(x.split("-")[0][1:]))
            # wt_maf = product of (1 - maf) over the population's variants
            data.wt_maf[pop]=1.0
            for v,tmaf in data.total_mafs[pop].items():
                data.wt_maf[pop]*=(1-tmaf)
    else:
        data.total_varnames['pop']=[]
        for item in varnames:
            for v in varnames[item]:
                if v not in data.total_mafs:
                    data.total_varnames['pop'].append(v)
                    data.total_mafs[v]=mafs[item][v]
        data.wt_maf['pop']=1.0
        for v,tmaf in data.total_mafs.items():
            data.wt_maf['pop']*=(1-tmaf)
        data.total_varnames['pop']=sorted(data.total_varnames['pop'], key=lambda x: int(x.split("-")[0][1:]))
{% endraw %} {% raw %}
# Inspect the sorted variant names collected by the filtering cell, keyed by
# population (or by the literal key 'pop' when freq_by_fam is off).
data.total_varnames
{% endraw %} {% raw %}
??cstatgen.HaplotypingEngine()
{% endraw %} {% raw %}
# Inspect the per-population product of (1 - MAF) over the observed variants,
# as computed by the filtering cell.
data.wt_maf
{% endraw %} {% raw %}
# Pull 1000 regions off the queue and keep, in `aa`, those for which `dd`
# holds at least one variant after `tmp` has been applied to it.
aa = []
for _ in range(1000):
    a = queue.get()
    tmp.getRegion(a)
    tmp.apply(dd)
    if len(dd.variants) == 0:
        # nothing extracted for this region
        continue
    aa.append(a)
{% endraw %} {% raw %}
# Number of regions kept in `aa` by the collection loop.
len(aa)
{% endraw %} {% raw %}
# Point `tmp` at the first kept region, then apply it to a fresh deep copy of
# `data` so the original object is left untouched.
tmp.getRegion(aa[0])
dd = deepcopy(data)
tmp.apply(dd)
{% endraw %} {% raw %}
# Apply tmp1 to the region data in `dd`.
# NOTE(review): tmp1 appears to be the MarkerMaker instance (see the
# `_MarkerMaker__Haplotype` lookup in this notebook) -- confirm.
tmp1.apply(dd)
{% endraw %} {% raw %}
# Look up the private `__Haplotype` method through its name-mangled attribute;
# the unmangled spelling is not reachable from outside the class body.
tmp1._MarkerMaker__Haplotype
{% endraw %} {% raw %}
# Flag the haplotyping cell branches on to use per-family (population) MAF tables.
tmp1.freq_by_fam
{% endraw %} {% raw %}
# Flag selecting local_count_mafs as the MAF source when tmp1.mle is off.
tmp1.count
{% endraw %} {% raw %}
# Flag selecting local_mle_mafs as the MAF source (checked before tmp1.count).
tmp1.mle
{% endraw %} {% raw %}
# Flag restricting haplotyping to variants with MAF <= tmp1.maf_cutoff.
tmp1.rvhaplo
{% endraw %} {% raw %}
# Run the next two pipeline stages on `dd`, in order.
# NOTE(review): tmp2 is presumably the LinkageWriter stage -- confirm.
tmp1.apply(dd)
tmp2.apply(dd)
{% endraw %} {% raw %}
# Re-check the per-family frequency flag.
tmp1.freq_by_fam
{% endraw %} {% raw %}
# Re-check the founder-count MAF flag.
tmp1.count
{% endraw %} {% raw %}
# When true, the haplotyping cell converts pedigree samples directly instead
# of calling tmp1.haplotyper.Execute.
env.prephased
{% endraw %} {% raw %}
# Log path prefix; the haplotyping cell appends the process id and '.log'.
env.tmp_log
{% endraw %} {% raw %}
# Fresh, empty output containers for a single direct call of the private
# haplotyping routine on `dd`.
varnames, recombPos = {}, {}
mafs = {}   ##Per fam per variant
uniq_vars, exclude_vars = [], []
haplotypes = OrderedDict()
tmp1._MarkerMaker__Haplotype(
    dd,
    haplotypes,
    mafs,
    varnames,
    recombPos,
    uniq_vars,
    exclude_vars,
)
{% endraw %} {% raw %}
# NOTE(review): `tm` is not assigned in any visible cell -- this looks like a
# truncated `tmp...` name and may raise NameError; confirm or remove.
tm
{% endraw %} {% raw %}
# Inspect recomb_perfam; the driver cell passes
# `recomb_perfam=not args.recomb_cross_fam` when building MarkerMaker.
tmp1.recomb_perfam
{% endraw %} {% raw %}
# Re-run tmp1 on `dd`.
tmp1.apply(dd)
{% endraw %} {% raw %}
# Run tmp2 on `dd`.
# NOTE(review): presumably the LinkageWriter stage -- confirm.
tmp2.apply(dd)
{% endraw %} {% raw %}
# Inspect the variants on the original `data` object (not the `dd` copy).
data.variants
{% endraw %} {% raw %}
# Inspect the include list; the filtering cell excludes variants whose name
# suffix (text after the last '-') is not in it.
dd.include_vars
{% endraw %} {% raw %}
# Inspect the variants held by the working copy `dd`.
dd.variants
{% endraw %} {% raw %}
# Point tmp1 at the first kept region before applying it to `dd`.
tmp1.getRegion(aa[0])
tmp1.apply(dd)
{% endraw %} {% raw %}
# Turn on debug output; cells guarded by `if env.debug:` will now write to stderr.
env.debug =True
{% endraw %} {% raw %}
# Inspect the variants on the original `data` object again.
data.variants
{% endraw %} {% raw %}
# Chromosome of the working copy; the haplotyping cell passes `data.chrom`
# as the first argument to the haplotyper.
dd.chrom
{% endraw %} {% raw %}
# Inspect the variants held by `dd` again.
dd.variants
{% endraw %} {% raw %}
# Inspect the `vcf` attribute of `tmp`.
# NOTE(review): `tmp` is presumably the RegionExtractor built from args.vcf -- confirm.
tmp.vcf
{% endraw %} {% raw %}
??EncoderWorker
{% endraw %} {% raw %}
# Current value of the shared total counter.
env.total_counter.value
{% endraw %} {% raw %}
# Inspect the worker list (EncoderWorker objects built in the driver cell).
jobs
{% endraw %} {% raw %}
# Start every worker before joining any of them, so they run concurrently;
# joining inside the first loop would serialise them.
for j in jobs:
    j.start()
for j in jobs:
    j.join()
# All workers have finished: stop dumping tracebacks on fatal signals.
faulthandler.disable()
{% endraw %} {% raw %}
    try:
        faulthandler.enable(file=open(env.tmp_log + '.SEGV', 'w'))
        for i in regions:
            if isinstance(queue, list):
                queue.append(i)
            else:
                queue.put(i)
        freq_by_fam_flag = False
        if not args.freq_by_fam is None:
            print('haha')
            freq_by_fam_flag = True
            with open(args.freq_by_fam) as freq_fh:
                for freq_line in freq_fh:
                    tmp_eles=freq_line.split()   #Fam and Population
                    data.freq_by_fam[tmp_eles[0]]=tmp_eles[1]
            data.freq=sorted(list(set(data.freq_by_fam.values())))
        else:
            data.freq=args.freq
        jobs = [EncoderWorker(
            queue, len(regions), deepcopy(data),
            RegionExtractor(args.vcf, chr_prefix = args.chr_prefix, allele_freq_info = data.freq, include_vars_file=args.include_vars),
            MarkerMaker(args.bin, maf_cutoff = args.maf_cutoff,single_markers=args.single_markers,recomb_max=args.recomb_max,af_info=data.freq,freq_by_fam=freq_by_fam_flag,rsq=args.rsq,mle=args.mle, rvhaplo=args.rvhaplo, recomb_perfam=not args.recomb_cross_fam),
            LinkageWriter(len(samples_not_vcf))
            ) for i in range(env.jobs)]
        for j in jobs:
            j.start()
        for j in jobs:
            j.join()
        faulthandler.disable()
    except KeyboardInterrupt:
        # FIXME: need to properly close all jobs
        raise ValueError("Use 'killall {}' to properly terminate all processes!".format(env.prog))
    else:
        env.log('{:,d} units (from {:,d} variants) processed; '\
            '{:,d} Mendelian inconsistencies and {:,d} recombination events handled\n'.\
            format(env.success_counter.value,
                   env.variants_counter.value,
                   env.mendelerror_counter.value,
                   env.recomb_counter.value), flush = True)
        if env.triallelic_counter.value:
            env.log('{:,d} tri-allelic loci were ignored'.format(env.triallelic_counter.value))
        if env.commonvar_counter.value:
            env.log('{:,d} variants ignored due to having MAF > {} and other specified constraints'.\
                    format(env.commonvar_counter.value, args.maf_cutoff))
        if env.null_counter.value:
            env.log('{:,d} units ignored due to absence in VCF file'.format(env.null_counter.value))
        if env.trivial_counter.value:
            env.log('{:,d} units ignored due to absence of variation in samples'.format(env.trivial_counter.value))
        fatal_errors = 0
        try:
            # Error msg from C++ extension
            os.system("cat {}/*.* > {}".format(env.tmp_dir, env.tmp_log))
            fatal_errors = wordCount(env.tmp_log)['fatal']
        except KeyError:
            pass
        if env.chperror_counter.value:
            env.error("{:,d} regional markers failed to be generated due to haplotyping failures!".\
                      format(env.chperror_counter.value))
        if fatal_errors:
            env.error("{:,d} or more regional markers failed to be generated due to runtime errors!".\
                      format(fatal_errors))
        env.log('Archiving regional marker data to directory [{}]'.format(env.cache_dir))
        cache.write(arcroot = 'CACHE', source_dir = env.tmp_cache)
env.jobs = args.jobs
{% endraw %} {% raw %}
# Convert the encoded .tped units to every requested output format (reusing
# the cache unless --vanilla), then either run the linkage analysis or just
# export the converted data.
tpeds = [os.path.join(env.tmp_cache, item) for item in os.listdir(env.tmp_cache) if item.startswith(env.output) and item.endswith('.tped')]
for fmt in args.format:
    cache.setID(fmt)
    if not args.vanilla and cache.check():
        env.log('Loading {} data from archive ...'.format(fmt.upper()))
        cache.load(target_dir = env.tmp_dir, names = [fmt.upper()])
    else:
        env.log('{:,d} units will be converted to {} format'.format(env.success_counter.value, fmt.upper()))
        env.format_counter.value = 0
        # NOTE: `format` here is called with project arguments, i.e. it is the
        # converter pulled in by the star import, not the builtin.
        format(tpeds, os.path.join(env.tmp_cache, "{}.tfam".format(env.output)),
               args.prevalence, args.wild_pen, args.muta_pen, fmt,
               args.inherit_mode, args.theta_max, args.theta_inc)
        env.log('{:,d} units successfully converted to {} format\n'.\
                format(env.format_counter.value, fmt.upper()), flush = True)
        if env.skipped_counter.value:
            # FIXME: perhaps we need to rephrase this message?
            env.log('{} region - family pairs skipped'.\
                    format(env.skipped_counter.value))
        env.log('Archiving {} format to directory [{}]'.format(fmt.upper(), env.cache_dir))
        cache.write(arcroot = fmt.upper(),
                    source_dir = os.path.join(env.tmp_dir, fmt.upper()), mode = 'a')
mkpath(env.outdir)
if args.run_linkage:
    cache.setID('analysis')
    # The messages below have no format placeholders; the former
    # `.format(fmt.upper())` calls only added a dependency on the leaked loop
    # variable `fmt` (NameError when args.format is empty).
    if not args.vanilla and cache.check():
        env.log('Loading linkage analysis result from archive ...')
        cache.load(target_dir = env.output, names = ['heatmap'])
    else:
        env.log('Running linkage analysis ...')
        run_linkage(args.blueprint, args.theta_inc, args.theta_max, args.output_limit)
        env.log('Linkage analysis succesfully performed for {:,d} units\n'.\
                format(env.run_counter.value), flush = True)
        if env.makeped_counter.value:
            env.log('{} "makeped" runtime errors occurred'.format(env.makeped_counter.value))
        if env.pedcheck_counter.value:
            env.log('{} "pedcheck" runtime errors occurred'.format(env.pedcheck_counter.value))
        if env.unknown_counter.value:
            env.log('{} "unknown" runtime errors occurred'.format(env.unknown_counter.value))
        if env.mlink_counter.value:
            env.log('{} "mlink" runtime errors occurred'.format(env.mlink_counter.value))
        cache.write(arcroot = 'heatmap', source_dir = os.path.join(env.output, 'heatmap'), mode = 'a')
    html(args.theta_inc, args.theta_max, args.output_limit)
else:
    env.log('Saving data to [{}]'.format(os.path.abspath(env.output)))
    cache.load(target_dir = env.output, names = [fmt.upper() for fmt in args.format])
{% endraw %} {% raw %}
# NOTE(review): called on the class itself, whereas earlier cells instantiate
# Args() and use `.parser.parse_args(...)` -- confirm get() works without an instance.
Args.get()
{% endraw %}