Contents

runMLZ

This is the main script used to run MLZ. It reads the input file template, takes all of its configuration from that file, and prints the time spent on each step.
#!/usr/bin/env python
__author__ = 'Matias Carrasco Kind'
from numpy import *
import copy
import random as rn
import os, sys

try:
    from ml_codes import *
    from utils import *
except:
    from mlz.ml_codes import *
    from mlz.utils import *
# Probe for the compiled Fortran SOM module: first as a submodule of the
# ml_codes package, then as a stand-alone top-level module. SF90 ends up True
# when either import succeeds and False when both fail.
try:
    import ml_codes.somF
except:
    try:
        import somF
    except:
        SF90 = False
    else:
        SF90 = True
else:
    SF90 = True
sys.setrecursionlimit(8000)

# Choose the execution mode: MPI when mpi4py can be imported, otherwise a
# single serial process that acts as rank 0 of 1.
try:
    from mpi4py import MPI
except ImportError:
    PLL = 'SERIAL'
    size = 1
    rank = 0
else:
    PLL = 'MPI'
    comm = MPI.COMM_WORLD
    size = comm.Get_size()
    rank = comm.Get_rank()

# Only the root process prints the banner and owns the global stopwatch.
if rank == 0:
    utils_mlz.print_welcome()
    clock_all = utils_mlz.Stopwatch()

Nproc = size  # number of processors
Narg = len(sys.argv)
if Narg != 2:
    # Exactly one command-line argument (the inputs file) is expected;
    # otherwise show the usage text once and stop every process.
    if rank == 0:
        utils_mlz.usage()
    if PLL == 'MPI':
        MPI.Finalize()
    sys.exit(0)

FilePars = sys.argv[1]  # inputs file name

# READ PARAMETERS
# Only the root process asks read_dt_pars for verbose output.
verbo = rank == 0
Pars_in = utils_mlz.read_dt_pars(FilePars, verbose=verbo)

# The variable-importance pass below reads Train.Xoob, which is only prepared
# when OOB is enabled, so requesting importance must switch OOB on.
# (This used to be written with `==`, a comparison whose result was thrown
# away, so the option was never actually forced.)
if Pars_in.varimportance == 'yes':
    Pars_in.ooberror = 'yes'

if Pars_in.checkonly == 'yes':
    if rank == 0:
        utils_mlz.printpz()
        utils_mlz.printpz('********************************')
        utils_mlz.printpz('* Check Mode only for testing  *')
        utils_mlz.printpz('********************************')
        utils_mlz.printpz()

if rank == 0:
    utils_mlz.printpz("Starting with ", size, " processors")
    utils_mlz.printpz()
    # Stopwatch for the training phase (root only).
    local_clock = utils_mlz.Stopwatch('no')

# Prediction mode ('TPZ', 'TPZ_C' or 'SOM' are the values handled below) and
# prediction class; the TPZ_C mode always forces the 'Class' prediction class.
Pmode = Pars_in.predictionmode
if Pmode == 'TPZ_C':
    Pars_in.predictionclass = 'Class'
Cmode = Pars_in.predictionclass
#READ TRAIN AND TEST FILES
if Pars_in.dotrain == 'yes':
    # Training catalog for this process.
    Train = data.catalog(Pars_in, cat_type='train', rank=rank)

    # With nrandom > 1 only the root calls make_random(); the first barrier
    # holds the other ranks back until that call has returned, and the second
    # one until every rank has finished load_random().
    if Pars_in.nrandom > 1 and rank == 0:
        Train.make_random(ntimes=int(Pars_in.nrandom))
    if PLL == 'MPI':
        comm.Barrier()
    if Pars_in.nrandom > 1:
        Train.load_random()
    if PLL == 'MPI':
        comm.Barrier()  ##<>##

    # Root-only summary of the catalog size and the selected modes.
    if rank == 0:
        utils_mlz.printpz('-> NUMBER OF GALAXIES IN TRAIN CATALOG : ', len(Train.cat))
        utils_mlz.print_mode(Pmode)
        utils_mlz.print_mode(Cmode)
        if Pmode == 'SOM' and not SF90:
            utils_mlz.printpz()
            for banner in ('******************************************************',
                           '* Fortran module somF not found, using python module *',
                           '*   try: f2py -c -m somF som.f90 to compile it       *',
                           '******************************************************'):
                utils_mlz.printpz(banner)
        utils_mlz.printpz()

    # Attribute weights: read from the importance file when one is given,
    # otherwise one weight of 1 per attribute. Two independent arrays are
    # built because `importance` is overwritten later on.
    if Pars_in.importancefile != 'none':
        Pars_in.importance = loadtxt(Pars_in.importancefile)
        Pars_in.importance_all = loadtxt(Pars_in.importancefile)
    else:
        Pars_in.importance = ones(len(Pars_in.att))
        Pars_in.importance_all = ones(len(Pars_in.att))

    oob = {}
    out = {}

    # TRAIN FIRST
    # Total number of trees/maps, and the [s0, s1) slice of them assigned to
    # this rank (s0 and s1 are used as a half-open range below).
    ntot = int(Pars_in.nrandom * Pars_in.ntrees)
    if rank == 0:
        utils_mlz.printpz('Total trees (maps) : ', str(ntot))
    s0, s1 = utils_mlz.get_limits(ntot, Nproc, rank)
    if rank == 0:
        # Show the slice handed to every core.
        for core in xrange(Nproc):
            lo, hi = utils_mlz.get_limits(ntot, Nproc, core)
            utils_mlz.printpz(lo, ' ', hi, ' -------------> to core ', core)

    # State that has to outlive a single iteration. ``T`` is bound up front so
    # that the ``del T`` after the loop also works when this rank was handed an
    # empty range. ``oobV`` collects the permuted out-of-bag predictions of
    # EVERY tree/map trained on this rank, keyed by attribute name; it used to
    # be re-created inside the loop, which silently kept the last tree only.
    T = None
    oobV = {}

    for kss in xrange(s0, s1):
        if Pars_in.nrandom > 1:
            # Floor division gives the index that is passed to Train.newcat().
            ir = kss // int(Pars_in.ntrees)
            if ir != 0: Train.newcat(ir)
        # DD selects the attributes handed to get_XY(): the result of
        # Train.sample_dim() for SOM, the literal 'all' for the tree modes.
        if Pmode == 'SOM': DD = Train.sample_dim(int(Pars_in.natt))
        if Pmode == 'TPZ': DD = 'all'
        if Pmode == 'TPZ_C': DD = 'all'
        if Pars_in.ooberror == 'yes': Train.oob_data_cat()

        Train.get_XY(bootstrap='yes', curr_at=DD)
        if DD != 'all':
            # Keep only the weights of the attributes present in DD.
            Pars_in.importance = array([Pars_in.importance_all[jk]
                                        for jk, ik in enumerate(Pars_in.att) if ik in DD])

        # Train one model of the selected kind and save it under index kss.
        if Pmode == 'TPZ':
            T = TPZ.Rtree(Train.X, Train.Y, forest='yes', minleaf=int(Pars_in.minleaf), mstar=int(Pars_in.natt),
                          dict_dim=DD)
            T.save_tree(kss, fileout=Train.Pars.treefilename, path=Train.Pars.path_output_trees)
        if Pmode == 'TPZ_C':
            YC = arange(int(Pars_in.minz), int(Pars_in.maxz) + 1, dtype='int')
            T = TPZ.Ctree(Train.X, Train.Y, forest='yes', minleaf=int(Pars_in.minleaf), mstar=int(Pars_in.natt),
                          dict_dim=DD,
                          impurity=Pars_in.impurityindex, nclass=YC)
            T.save_tree(kss, fileout=Train.Pars.treefilename, path=Train.Pars.path_output_trees)
        if Pmode == 'SOM':
            aps = Pars_in.alphastart
            ape = Pars_in.alphaend
            T = SOMZ.SelfMap(Train.X, Train.Y, Ntop=int(Pars_in.ntop), topology=Pars_in.topology, som_type=Pars_in.somtype,
                             iterations=int(Pars_in.iterations), periodic=Pars_in.periodic, dict_dim=DD, astart=aps,
                             aend=ape,
                             importance=Pars_in.importance)
            # Use the Fortran map builder when the somF module was importable.
            if SF90:
                T.create_mapF()
            else:
                T.create_map()
            T.evaluate_map(inputX=Train.X, inputY=Train.Y)
            T.save_map(kss, fileout=Train.Pars.somfilename, path=Train.Pars.path_output_maps)

        # OOB DATA
        # Append this model's predictions for each out-of-bag object to the
        # entry keyed by the object's original index; a first value of -1
        # from get_vals() is skipped.
        if Pars_in.ooberror == 'yes':
            for io in xrange(len(Train.Xoob)):
                ij = Train.oob_index_or[io]
                if ij not in oob:
                    oob[ij] = {'zp': [], 'zs': Train.Yoob[io]}
                tempo = T.get_vals(Train.Xoob[io])
                if tempo[0] != -1.:
                    oob[ij]['zp'].extend(tempo)

        # Variable importance: for each attribute column, shuffle that column
        # among the OOB rows, predict again and accumulate the results per
        # attribute name across all models of this rank.
        if Pars_in.varimportance == 'yes':
            for ka in xrange(len(Train.indx)):
                k_name = Train.cols[Train.indx[ka]]
                per_att = oobV.setdefault(k_name, {})
                Xoob = copy.deepcopy(Train.Xoob)
                indexp = rn.sample(xrange(len(Xoob)), len(Xoob))
                Xoob[:, ka] = Train.Xoob[indexp, ka]
                for io in xrange(len(Train.Xoob)):
                    ij = Train.oob_index_or[io]
                    if ij not in per_att:
                        per_att[ij] = {'zp': [], 'zs': Train.Yoob[io]}
                    tempo = T.get_vals(Xoob[io])
                    if tempo[0] != -1.:
                        per_att[ij]['zp'].extend(tempo)

    del T
    if PLL == 'MPI': comm.Barrier()

    # Redshift binning used when the OOB PDFs are written; remember the
    # catalog size before the catalog itself is released.
    zfine, zfine2, resz, resz2, wzin = analysis.get_zbins(Pars_in)
    zfine2 = zfine2[wzin]
    train_nobj = Train.nobj
    del Train


    def join_oob(rank, oob, PLL):
        """Merge the per-process OOB dictionaries onto the root process.

        Every process must call this together: non-root ranks send their
        dictionary to rank 0 (MPI mode only) and rank 0 folds each received
        dictionary into a deep copy of its own. Relies on the module-level
        ``comm``, ``Nproc`` and ``train_nobj``.

        :param rank: rank of the calling process
        :param oob: dictionary mapping an object index to a dictionary with
            the keys ``'zp'`` (list of predictions) and ``'zs'``
        :param PLL: ``'MPI'`` to communicate, anything else for a serial run
        :return: the merged dictionary on rank 0, ``0.`` on any other rank
        """
        use_mpi = PLL == 'MPI'
        merged = 0.
        if rank != 0:
            if use_mpi:
                comm.send(oob, dest=0, tag=2 * rank)
        else:
            merged = copy.deepcopy(oob)
            for sender in xrange(1, Nproc):
                if use_mpi:
                    oob = comm.recv(source=sender, tag=sender * 2)
                # Only indices below train_nobj are looked up.
                for idx in xrange(train_nobj):
                    if idx not in oob:
                        continue
                    preds = oob[idx]['zp']
                    spec = oob[idx]['zs']
                    if idx in merged:
                        merged[idx]['zp'].extend(preds)
                    else:
                        merged[idx] = {'zp': preds, 'zs': spec}
        if use_mpi:
            comm.Barrier()
        del oob
        return merged


    # Cross validation on the out-of-bag predictions.
    if Pars_in.ooberror == 'yes':
        if rank == 0:
            utils_mlz.printpz()
            utils_mlz.printpz("Cross validation with OOB data")
            utils_mlz.printpz()

        # Every rank takes part in the merge; the merged data ends up on root.
        Boob = join_oob(rank, oob, PLL)
        del oob
        if rank == 0:
            oob_pz = analysis.GetPz(Boob, train_nobj, Pars_in)
            Z0b, BP0b = oob_pz.compute()
            analysis.save_single(Z0b, Pars_in, oob='yes')
            del Z0b
            # PDFs are only written for the 'Reg' prediction class.
            if Cmode == 'Reg':
                analysis.save_PDF(zfine2, BP0b, Pars_in, oob='yes')
            del BP0b, Boob

    # Importance ranking: one merged OOB result per attribute, saved with the
    # attribute name as a suffix.
    if Pars_in.varimportance == 'yes':
        if rank == 0:
            utils_mlz.printpz("Computing importance ranking")
        if PLL == 'MPI':
            comm.Barrier()
        for ka in Pars_in.att:
            Btemp = join_oob(rank, oobV[ka], PLL)
            if rank != 0:
                continue
            oob_pz = analysis.GetPz(Btemp, train_nobj, Pars_in)
            Z0b = oob_pz.compute(do_pdf='no')
            analysis.save_single(Z0b, Pars_in, oob='yes', var='_' + ka)
            del Z0b, Btemp
        del oobV

    # Wait for every rank, then let root report the training time.
    if PLL == 'MPI':
        comm.Barrier()
    if rank == 0:
        utils_mlz.printpz()
        utils_mlz.printpz('+-+-+-+-+-+-+-+-+-+-+-+-+')
        utils_mlz.printpz(PLL, ' time Training')
        utils_mlz.printpz('+-+-+-+-+-+-+-+-+-+-+-+-+')
        local_clock.elapsed()

# Training-only run: say so on the root process and stop before the test phase.
if Pars_in.dotest == 'no':
    if rank == 0:
        utils_mlz.printpz()
        for note in ("Only training was selected", "Check utils/utils_mlz.py "):
            utils_mlz.printpz(note)
        utils_mlz.printpz()
    if PLL == 'MPI':
        MPI.Finalize()
    sys.exit(0)

#------------------------------------------------------------------------
# NOW TEST
# Redshift binning and total number of trained trees/maps to apply.
zfine, zfine2, resz, resz2, wzin = analysis.get_zbins(Pars_in)
zfine2 = zfine2[wzin]
ntot = int(Pars_in.nrandom * Pars_in.ntrees)

# Root starts the test stopwatch and counts the galaxies in the test file;
# the count lives in a 0-d int array so it can be broadcast in place.
if rank != 0:
    Ng = array(0, 'i')
else:
    local_clock = utils_mlz.Stopwatch('no')
    cat_temp = data.read_catalog(Pars_in.path_test + Pars_in.testfile, check=Pars_in.checkonly)
    Ng = array(len(cat_temp), 'i')
    del cat_temp

if PLL == 'MPI':
    comm.Barrier()
    comm.Bcast([Ng, MPI.INT], root=0)

# Slice of the test catalog handled by this rank, plus a root-only listing
# of the slices handed to every core.
s0, s1 = utils_mlz.get_limits(Ng, Nproc, rank)
if rank == 0:
    utils_mlz.printpz('-> NUMBER OF GALAXIES IN TEST CATALOG : ', Ng)
    for core in xrange(Nproc):
        lo, hi = utils_mlz.get_limits(Ng, Nproc, core)
        utils_mlz.printpz(lo, ' ', hi, ' -------------> to core ', core)

Test = data.catalog(Pars_in, cat_type='test', L1=s0, L2=s1, rank=rank)

# Where the trained models live and their base file name: both tree modes
# share the tree settings, SOM uses the map settings.
if Pmode in ('TPZ', 'TPZ_C'):
    path1 = Test.Pars.path_output_trees
    fileb = Pars_in.treefilename
elif Pmode == 'SOM':
    path1 = Test.Pars.path_output_maps
    fileb = Pars_in.somfilename

Test.get_XY()

# Per-object result table (7 columns) plus the accumulators that the
# prediction loop fills, which depend on the prediction class.
Z0 = zeros((Test.nobj, 7))
if Cmode == 'Reg':
    BP0 = zeros((Test.nobj, len(zfine2)))
    BP0raw = zeros((Test.nobj, len(zfine) - 1))
    Test_S = analysis.GetPz_short(Pars_in)
elif Cmode == 'Class':
    # Running sum, sum of squares and count of the predicted values.
    S1 = zeros(Test.nobj)
    S2 = zeros(Test.nobj)
    Nv = zeros(Test.nobj)

# Apply every saved tree/map to every test object of this rank.
for k in xrange(ntot):
    # Models are stored as <path><base>_<4-digit index>.npy and hold a
    # single pickled object, unwrapped with .item().
    filec = path1 + fileb + '_%04d' % k + '.npy'
    S = load(filec).item()
    DD = S.dict_dim
    # SOM models carry their own attribute selection, so X is rebuilt for it.
    if Pmode == 'SOM':
        Test.get_XY(curr_at=DD)
    for i in xrange(Test.nobj):
        temp = S.get_vals(Test.X[i])
        # A first value of -1 means there is nothing to accumulate.
        if temp[0] == -1.:
            continue
        if Cmode == 'Reg':
            BP0raw[i, :] += Test_S.get_hist(temp)
        if Cmode == 'Class':
            vals = array(temp)
            S1[i] += sum(vals)
            S2[i] += sum(vals * vals)
            Nv[i] += 1. * len(temp)

# Reference values: the catalog's Y column when it has one, zeros otherwise.
yvals = Test.Y if Test.has_Y() else zeros(Test.nobj)

if Cmode == 'Reg':
    # Turn each accumulated raw histogram into a result row and a PDF.
    for k in xrange(Test.nobj):
        Z0[k, :], BP0[k, :] = Test_S.get_pdf(BP0raw[k], yvals[k])
    del BP0raw, yvals
elif Cmode == 'Class':
    # Columns 0-3 hold the reference value and the three arrays returned
    # by class_stat(); the remaining columns stay zero.
    z_0, z_1, s_0 = analysis.class_stat(S1, S2, Nv, Pars_in)
    for col, column_vals in enumerate((yvals, z_0, z_1, s_0)):
        Z0[:, col] = column_vals

# Root reports the time spent testing.
if rank == 0:
    utils_mlz.printpz()
    utils_mlz.printpz('+-+-+-+-+-+-+-+-+-+-+-+-+')
    utils_mlz.printpz(PLL, ' time Testing')
    utils_mlz.printpz('+-+-+-+-+-+-+-+-+-+-+-+-+')
    local_clock.elapsed()

if PLL == 'MPI':
    comm.Barrier()
####################################

# Gather the per-rank result tables on root: every other rank sends its Z0,
# root copies its own rows and then receives each remaining slice in rank
# order into the matching rows of BIGZ.
if rank != 0:
    if PLL == 'MPI':
        comm.Send(Z0, dest=0, tag=rank * 2)
    del Z0
else:
    BIGZ = zeros((Ng, 7))
    BIGZ[s0:s1, :] = Z0
    for srank in xrange(1, Nproc):
        s0, s1 = utils_mlz.get_limits(Ng, Nproc, srank)
        ZT = zeros((s1 - s0, 7))
        if PLL == 'MPI':
            comm.Recv(ZT, source=srank, tag=srank * 2)
        BIGZ[s0:s1, :] = ZT
        del ZT
if PLL == 'MPI':
    comm.Barrier()

# Root writes the combined table.
if rank == 0:
    analysis.save_single(BIGZ, Pars_in)
    del BIGZ

# PDFs are written only for the 'Reg' prediction class and only on request.
if Cmode == 'Reg' and Pars_in.writepdf == 'yes':
    if Pars_in.multiplefiles != 'yes':
        # Single output: gather every rank's PDF rows on root, then save.
        s0, s1 = utils_mlz.get_limits(Ng, Nproc, rank)
        if rank != 0:
            if PLL == 'MPI':
                comm.Send(BP0, dest=0, tag=rank * 2)
        else:
            BIGP = zeros((Ng, len(zfine2)))
            BIGP[s0:s1, :] = BP0
            for srank in xrange(1, Nproc):
                s0, s1 = utils_mlz.get_limits(Ng, Nproc, srank)
                size_dat = s1 - s0
                BPT = zeros((size_dat, len(zfine2)))
                if PLL == 'MPI':
                    comm.Recv(BPT, source=srank, tag=srank * 2)
                BIGP[s0:s1, :] = BPT
                del BPT
        if PLL == 'MPI':
            comm.Barrier()
        if rank == 0:
            analysis.save_PDF(zfine2, BIGP, Pars_in)
            del BIGP
    else:
        # Multiple outputs: every rank saves its own PDF rows.
        if rank == 0:
            utils_mlz.printpz()
            utils_mlz.printpz('************************************')
            utils_mlz.printpz('** Writing multiple file for PDFs **')
            utils_mlz.printpz('************************************')
            utils_mlz.printpz()
        path_r, filebase_r, num_r = analysis.get_path_new(Pars_in, rank)
        analysis.save_PDF(zfine2, BP0, Pars_in, path=path_r, filebase=filebase_r, num=num_r, multiple='yes', rank=rank)

# Root reports the total run time; MPI is shut down on every rank.
if rank == 0:
    utils_mlz.printpz('+-+-+-+-+-+-+-+-+')
    utils_mlz.printpz(PLL, ' TOTAL TIME')
    utils_mlz.printpz('+-+-+-+-+-+-+-+-+')
    clock_all.elapsed()
if PLL == 'MPI':
    MPI.Finalize()

Contents