
Source Code for Module BIP.Bayes.Melding

# -*- coding:utf-8 -*-
#-----------------------------------------------------------------------------
# Name:        Melding.py
# Purpose:     The Bayesian melding Class provides
#              uncertainty analyses for simulation models.
#
# Author:      Flávio Codeço Coelho
#
# Created:     2003/08/10
# Copyright:   (c) 2003-2008 by the Author
# Licence:     GPL
#-----------------------------------------------------------------------------
from numpy.core.records import recarray
try:
    import psyco
    psyco.full()
except ImportError:
    pass
import sys
import os
import cPickle as CP
import like
import pylab as P
from scipy.stats.kde import gaussian_kde
from scipy import stats
import numpy
# less, greater and compress are required by Meld.filtM below
from numpy import array, nan_to_num, zeros, product, exp, ones, mean, var, less, greater, compress
from time import time
from numpy.random import normal, randint, random, seed
try:
    from BIP.Viz.realtime import RTplot
    Viz = True
except ImportError:
    Viz = False
    print r"""Please install Gnuplot-py to enable realtime visualization.
    http://gnuplot-py.sourceforge.net/
    """
import lhs

from multiprocessing import Pool

__docformat__ = "restructuredtext en"


class Meld:
    """
    Bayesian Melding class
    """
    def __init__(self, K, L, model, ntheta, nphi, alpha=0.5, verbose=False, viz=False):
        """
        Initializes the Melding class.

        :Parameters:
            - `K`: Number of replicates of the model run. Also determines the prior sample size.
            - `L`: Number of samples from the posterior distributions. Usually 10% of K.
            - `model`: Callable taking theta as argument and returning phi = M(theta).
            - `ntheta`: Number of inputs to the model (parameters).
            - `nphi`: Number of outputs of the model (state variables).
            - `alpha`: Pooling weight given to the user-provided priors on phi.
            - `verbose`: Boolean. Whether to show more information about the computations.
            - `viz`: Boolean. Whether to show graphical output of the fitting process.
        """
        self.K = K
        self.L = L
        self.verbose = verbose
        self.model = model
        self.likelist = []  # list of likelihoods
        self.q1theta = recarray(K, formats=['f8'] * ntheta)     # Theta priors (record array)
        self.post_theta = recarray(L, formats=['f8'] * ntheta)  # Theta posteriors (record array)
        self.q2phi = recarray(K, formats=['f8'] * nphi)         # Phi priors (record array)
        self.phi = recarray(K, formats=['f8'] * nphi)           # Phi model-induced priors (record array)
        self.q2type = []  # list of distribution types
        self.post_phi = recarray(L, formats=['f8'] * nphi)      # Phi posteriors (record array)
        self.ntheta = ntheta
        self.nphi = nphi
        self.alpha = alpha  # pooling weight of the user-provided phi priors
        self.done_running = False
        if Viz:  # Gnuplot-py installed
            self.viz = viz
        else:
            self.viz = False

    def setPhi(self, names, dists=[stats.norm], pars=[(0, 1)], limits=[(-5, 5)]):
        """
        Sets up the model's outputs (Phi) and generates the samples from the
        prior distributions needed for the melding replicates.

        :Parameters:
            - `names`: list of strings with the names of the variables.
            - `dists`: list of RNGs from scipy.stats.
            - `pars`: list of tuples of parameters for each prior distribution, respectively.
            - `limits`: list of (lower, upper) limits on the support of each variable.
        """
        if len(names) != self.nphi:
            raise ValueError("Number of names (%s) does not match the number of output variables (%s)." % (len(names), self.nphi))
        self.q2phi.dtype.names = names
        self.phi.dtype.names = names
        self.post_phi.dtype.names = names
        self.plimits = limits
        for n, d, p in zip(names, dists, pars):
            self.q2phi[n] = lhs.lhs(d, p, self.K).ravel()
            self.q2type.append(d.name)

    def setTheta(self, names, dists=[stats.norm], pars=[(0, 1)]):
        """
        Sets up the model's inputs and generates the samples from the prior
        distributions needed for the melding replicates.

        :Parameters:
            - `names`: list of strings with the names of the parameters.
            - `dists`: list of RNGs from scipy.stats.
            - `pars`: list of tuples of parameters for each prior distribution, respectively.
        """
        self.q1theta.dtype.names = names
        self.post_theta.dtype.names = names
        if os.path.exists('q1theta'):
            self.q1theta = CP.load(open('q1theta', 'r'))
        else:
            for n, d, p in zip(names, dists, pars):
                self.q1theta[n] = lhs.lhs(d, p, self.K).ravel()

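    # Illustrative note (not part of the original module): a typical prior setup,
    # as in main2() at the bottom of this file, pairs setTheta with setPhi, e.g.
    #   Me.setTheta(['r', 'p0'], [stats.uniform, stats.uniform], [(2, 4), (0, 5)])
    #   Me.setPhi(['p'], [stats.uniform], [(6, 9)], [(6, 9)])
    # Each tuple in pars parameterizes the corresponding scipy.stats distribution
    # (as interpreted by lhs.lhs), and limits bounds the support of each output.
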
    def setThetaFromData(self, names, data, limits):
        """
        Sets up the model's inputs and sets the prior distributions from the
        vectors in data.
        This method is to be used when the prior distributions are available in
        the form of a sample from an empirical distribution such as a Bayesian
        posterior.
        In order to expand the samples provided, K samples are generated from a
        kernel density estimate of the original sample.

        :Parameters:
            - `names`: list of strings with the names of the parameters.
            - `data`: list of vectors. Samples of a proposed distribution.
            - `limits`: list of (min, max) tuples for each theta, to make sure samples are not generated outside these limits.
        """
        self.q1theta.dtype.names = names
        self.post_theta.dtype.names = names
        if os.path.exists('q1theta'):
            self.q1theta = CP.load(open('q1theta', 'r'))
        else:
            i = 0
            for n, d in zip(names, data):
                smp = []
                while len(smp) < self.K:
                    try:
                        smp += [x for x in gaussian_kde(d).resample(self.K)[0] if x >= limits[i][0] and x <= limits[i][1]]
                    except:
                        print d
                        sys.exit()
                #print self.q1theta[n].shape, array(smp[:self.K]).shape
                self.q1theta[n] = array(smp[:self.K])
                i += 1

    def setPhiFromData(self, names, data, limits):
        """
        Sets up the model's outputs and sets their prior distributions from the
        vectors in data.
        This method is to be used when the prior distributions are available in
        the form of a sample from an empirical distribution such as a Bayesian
        posterior.
        In order to expand the samples provided, K samples are generated from a
        kernel density estimate of the original sample.

        :Parameters:
            - `names`: list of strings with the names of the variables.
            - `data`: list of vectors. Samples of the proposed distribution.
            - `limits`: list of tuples (ll, ul): lower and upper limits on the support of each variable.
        """
        self.q2phi.dtype.names = names
        self.phi.dtype.names = names
        self.post_phi.dtype.names = names
        self.limits = limits
        i = 0  # initialized outside the loop so that limits[i] tracks the current variable
        for n, d in zip(names, data):
            smp = []
            while len(smp) < self.K:
                smp += [x for x in gaussian_kde(d).resample(self.K)[0] if x >= limits[i][0] and x <= limits[i][1]]
            self.q2phi[n] = array(smp[:self.K])
            self.q2type.append('empirical')
            i += 1
        #self.q2phi = self.filtM(self.q2phi, self.q2phi, limits)

    def run(self, *args):
        """
        Runs the model through the Melding inference.
        self.model is a callable which returns the output of the deterministic
        model, i.e. the model itself.
        The model is run self.K times to obtain phi = M(theta).
        """
        po = Pool()  # local pool of processes (self.po is not set up in __init__)
        for i in xrange(self.K):
            theta = [self.q1theta[n][i] for n in self.q1theta.dtype.names]
            r = po.apply_async(self.model, theta)
            self.phi[i] = r.get()[-1]  # phi is the last point in the simulation
        po.close()
        po.join()
        self.done_running = True

    def getPosteriors(self, t=1):
        """
        Updates the posteriors of the model's output for the last t time steps.
        Returns two record arrays:
            - the posteriors of Theta;
            - the posteriors of Phi: the last t values of the time series, as self.L by `t` arrays.

        :Parameters:
            - `t`: length of the posterior time series to return.
        """
        if not self.done_running:
            return
        if t > 1:
            self.post_phi = recarray((self.L, t), formats=['f8'] * self.nphi)
            self.post_phi.dtype.names = self.phi.dtype.names

        def cb(r):
            '''
            Callback function for the asynchronous model runs.
            r: tuple with the results of a simulation (results, run#).
            '''
            if t == 1:
                self.post_phi[r[1]] = (r[0][-1],)
            else:
                self.post_phi[r[1]] = [tuple(l) for l in r[0][-t:]]

        po = Pool()
        # random indices for the marginal posteriors of theta
        pti = lhs.lhs(stats.randint, (0, self.L), siz=(self.ntheta, self.L))
        for i in xrange(self.L):  # Monte Carlo with values of the posterior of Theta
            theta = [self.post_theta[n][pti[j, i]] for j, n in enumerate(self.post_theta.dtype.names)]
            po.apply_async(enumRun, (self.model, theta, i), callback=cb)
            if i % 100 == 0 and self.verbose:
                print "==> L = %s" % i

        po.close()
        po.join()
        return self.post_theta, self.post_phi

    def filtM(self, cond, x, limits):
        '''
        Multiple condition filtering.
        Removes values in x[i] if the corresponding values in
        cond[i] are less than limits[i][0] or greater than
        limits[i][1].

        :Parameters:
            - `cond`: array of conditions.
            - `limits`: list of tuples (ll, ul) with length equal to the number of lines in `cond` and `x`.
            - `x`: array to be filtered.
        '''
        # Deconstruct the record array, if necessary.
        names = []
        if isinstance(cond, recarray):
            names = list(cond.dtype.names)
            cond = [cond[v] for v in cond.dtype.names]
            x = [x[v] for v in x.dtype.names]

        cond = array(cond)
        cnd = ones(cond.shape[1], int)
        for i, j in zip(cond, limits):
            ll = j[0]
            ul = j[1]
            #print cond.shape, cnd.shape, i.shape, ll, ul
            cnd = cnd & less(i, ul) & greater(i, ll)
        f = compress(cnd, x, axis=1)

        if names:  # Reconstruct the record array
            r = recarray((1, f.shape[1]), formats=['f8'] * len(names), names=names)
            for i, n in enumerate(names):
                r[n] = f[i]
            f = r

        return f

    def basicfit(self, s1, s2):
        '''
        Calculates a basic goodness-of-fit measure between a model-generated
        time series and an observed time series, using the root mean square
        (RMS) deviation.

        :Parameters:
            - `s1`: model-generated time series; record array.
            - `s2`: observed time series; dictionary with keys matching the names of s1.

        :Return:
            Root mean square deviation between `s1` and `s2`.
        '''
        fit = []
        for k in s2.keys():
            if s2[k] == [] or (not s2[k].any()):
                continue  # no observations for this variable
            e = numpy.sqrt(mean((s1[k] - s2[k]) ** 2.))
            fit.append(e)  # RMS deviation for this variable

        return mean(fit)  # mean RMS deviation over the observed variables

    def logPooling(self, phi):
        """
        Returns the probability associated with each phi[i]
        on the pooled pdf of phi and q2phi.

        :Parameters:
            - `phi`: prior of Phi induced by the model and q1theta.
        """
        # Estimating the multivariate joint probability densities
        phidens = gaussian_kde(array([phi[n][:, -1] for n in phi.dtype.names]))
        q2dens = gaussian_kde(array([self.q2phi[n] for n in self.q2phi.dtype.names]))
        # Determining the pooled probabilities for each phi[i]
        lastp = array([list(phi[i, -1]) for i in xrange(self.K)])
        qtilphi = (phidens.evaluate(lastp.T) ** (1 - self.alpha)) * q2dens.evaluate(lastp.T) ** self.alpha
        return qtilphi / sum(qtilphi)

    def abcRun(self, fitfun=None, data={}, t=1, nopool=False, savetemp=False):
        """
        Runs the model for inference through Approximate Bayesian Computation
        techniques. This method should be used as an alternative to the sir method.

        :Parameters:
            - `fitfun`: callable which will return the goodness of fit of the model to data as a number between 0 and 1, with 1 meaning a perfect fit.
            - `t`: number of time steps to retain at the end of the model run for fitting purposes.
            - `data`: dict containing the observed time series (lists of length t) of the state variables. This dict must have as many items as there are state variables, with labels matching the variable names. Unobserved variables must have an empty list as their value.
            - `nopool`: True if no priors on the outputs are available. Leads to faster calculations.
            - `savetemp`: should temp results be saved? Useful for long runs; allows resuming the simulation from the last saved state.
        """
        seed()
        if not fitfun:
            fitfun = self.basicfit
        if savetemp:
            CP.dump(self.q1theta, open('q1theta', 'w'))
        # Running the model ==========================
        phi = self.runModel(savetemp, t)

        print "==> Done Running the K replicates\n"
        # Do Log Pooling
        if nopool:
            qtilphi = ones(self.K)
        else:
            t0 = time()
            qtilphi = self.logPooling(phi)  # vector with the probability of each phi[i] under the pooled prior
            print "==> Done Running the Log Pooling (took %s seconds)\n" % (time() - t0)
        qtilphi = nan_to_num(qtilphi)
        #print 'max(qtilphi): ', max(qtilphi)
        if sum(qtilphi) == 0:
            print 'Pooled prior on outputs is null, please check your priors, and try again.'
            return 0

        # Calculate weights
        w = array([fitfun(phi[i], data) for i in xrange(phi.shape[0])])  # as an array, so it can be normalized below
        w /= sum(w)
        w = 1 - w
        #print "w=", w, mean(w), var(w)
        # Resampling Thetas
        w = nan_to_num(w)
        w = array(w) * qtilphi
        w /= sum(w)
        w = nan_to_num(w)
        print 'max(w): %s\nmean(w): %s\nvar(w): %s' % (max(w), mean(w), var(w))
        if sum(w) == 0.0:
            print 'Resampling weights are all zero, please check your model or data.'
            return 0
        t0 = time()
        j = 0
        while j < self.L:  # Extract L samples from q1theta
            i = randint(0, w.size)  # random position of w and q1theta
            if random() <= w[i]:
                self.post_theta[j] = self.q1theta[i]  # retain the sample according to the resampling probability
                j += 1
        print "==> Done Resampling (L=%s) priors (took %s seconds)" % (self.L, (time() - t0))

        self.done_running = True
        return 1

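    # Illustrative note (not part of the original module): for a model with two
    # state variables named 'S' and 'I' (hypothetical names), of which only 'I'
    # was observed over t=3 steps, the data argument would look like
    #   data = {'S': [], 'I': array([10., 12., 15.])}
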
    def sir(self, data={}, t=1, tau=0.1, nopool=False, savetemp=False):
        """
        Runs the model output through the Sampling-Importance-Resampling algorithm.
        Returns 1 if successful or 0 if not.

        :Parameters:
            - `data`: observed time series on the model's output.
            - `t`: length of the observed time series.
            - `tau`: precision of the Normal likelihood function.
            - `nopool`: True if no priors on the outputs are available. Leads to faster calculations.
            - `savetemp`: Boolean. Create a temp file?
        """
        seed()
        phi = self.runModel(savetemp, t)
        # Do Log Pooling
        if nopool:
            qtilphi = ones(self.K)
        else:
            t0 = time()
            qtilphi = self.logPooling(phi)  # vector with the probability of each phi[i] under the pooled prior
            print "==> Done Running the Log Pooling (took %s seconds)\n" % (time() - t0)
        qtilphi = nan_to_num(qtilphi)
        print 'max(qtilphi): ', max(qtilphi)
        if sum(qtilphi) == 0:
            print 'Pooled prior on outputs is null, please check your priors, and try again.'
            return 0

        # Calculating the likelihood of each phi[i] given the observed data
        lik = zeros(self.K)
        t0 = time()
        for i in xrange(self.K):
            l = 1
            for n in data.keys():
                if isinstance(data[n], list) and data[n] == []:
                    continue  # no observations for this variable
                elif isinstance(data[n], numpy.ndarray) and (not data[n].any()):
                    continue  # no observations for this variable
                p = phi[n]
                l *= product([exp(like.Normal(data[n][m], j, 1. / tau)) for m, j in enumerate(p[i])])
            lik[i] = l
        if self.viz:
            dtplot.clearFig(); phiplot.clearFig(); thplot.clearFig()
            dtplot.gp.xlabel('observed')
            dtplot.gp.ylabel('simulated')
            obs = []; sim = []
            for n in data.keys():
                obs.append(data[n])
                sim.append(phi[n].mean(axis=0).tolist())
            dtplot.scatter(array(obs), array(sim), names=data.keys(), title='fit')
            phiplot.plotlines(array(sim), names=data.keys(), title='Model Output')
        print "==> Done Calculating Likelihoods (took %s seconds)" % (time() - t0)
        lr = nan_to_num(max(lik) / min(lik))
        print '==> Likelihood (min,mean,max,sum): ', min(lik), mean(lik), max(lik), sum(lik)
        print "==> Likelihood ratio of best run/worst run: %s" % (lr,)
        # Calculating the weights
        w = nan_to_num(qtilphi * lik)
        w = nan_to_num(w / sum(w))

        if not sum(w) == 0.0:
            j = 0
            t0 = time()
            maxw = 0; minw = max(w)  # keep track of the goodness of fit of phi
            while j < self.L:  # Extract L samples from q1theta
                i = randint(0, w.size)  # random position of w and q1theta
                if random() * max(w) <= w[i]:
                    self.post_theta[j] = self.q1theta[i]  # retain the sample according to the resampling probability
                    maxw = max(maxw, w[i])
                    minw = min(minw, w[i])
                    j += 1
                    if not j % 100 and self.verbose:
                        print j, "of %s" % self.L
            self.done_running = True
            print "==> Done Resampling (L=%s) priors (took %s seconds)" % (self.L, (time() - t0))
            wr = maxw / minw
            print "==> Likelihood ratio of best/worst retained runs: %s" % (wr,)
            if wr == 1:
                print "==> Flat likelihood, trying again..."
                return 0
            print "==> Improvement: %s percent" % (100 - 100 * wr / lr,)
        else:
            print 'Resampling weights are all zero, please check your model or data, and try again.\n'
            print '==> Likelihood (min,mean,max): ', min(lik), mean(lik), max(lik)
            print '==> RMS deviation of outputs: %s' % (self.basicfit(phi, data),)
            return 0
        return 1

    def runModel(self, savetemp, t=1):
        '''
        Handles running the model self.K times, keeping a temporary save file
        so that the calculation can be resumed in case of interruption.

        :Parameters:
            - `savetemp`: Boolean. Create a temp file?
        '''
        if savetemp:
            CP.dump(self.q1theta, open('q1theta', 'w'))
        # Running the model ==========================
        if os.path.exists('phi.temp'):
            phi, j = CP.load(open('phi.temp', 'r'))
        else:
            j = 0
            phi = recarray((self.K, t), formats=['f8'] * self.nphi, names=self.phi.dtype.names)

        def cb(r):
            '''
            Callback function for the asynchronous model runs.
            '''
            if t == 1:
                phi[r[1]] = (r[0][-1],)
            else:
                phi[r[1]] = [tuple(l) for l in r[0][-t:]]  # phi is the last t points in the simulation

        po = Pool()
        t0 = time()
        for i in xrange(j, self.K):
            theta = [self.q1theta[n][i] for n in self.q1theta.dtype.names]
            r = po.apply_async(enumRun, (self.model, theta, i), callback=cb)
            if i % 100 == 0:  # report and checkpoint every 100 replicates
                if self.verbose:
                    print "==> K = %s" % i
                if savetemp:
                    CP.dump((phi, i), open('phi.temp', 'w'))
        if savetemp:  # if all replicates are done, clear the temporary save files
            os.unlink('phi.temp')
            os.unlink('q1theta')
        po.close()
        po.join()
        print "==> Done Running the K (%s) replicates (took %s seconds)\n" % (self.K, (time() - t0))

        return phi

def enumRun(model, theta, k):
    """
    Returns the model results plus the run number.

    :Parameters:
        - `model`: model callable
        - `theta`: model input list
        - `k`: run number

    :Return:
        - res: result list
        - `k`: run number
    """
    res = model(*theta)
    return (res, k)

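# Illustrative sketch (not part of the original module): Meld.logPooling above
# performs logarithmic pooling of two densities,
#     qtilde(phi) ~ q1(phi)**(1 - alpha) * q2(phi)**alpha,
# where q1 is a KDE of the model-induced prior on phi and q2 a KDE of the
# user-supplied prior on phi. The helper below reproduces that calculation on
# synthetic 1-D samples; the sample sizes and distributions are arbitrary.
def _pooling_sketch(alpha=0.5):
    model_induced = normal(0., 1., 1000)  # stand-in for the model-induced prior samples
    user_prior = normal(0.5, 2., 1000)    # stand-in for the user-supplied prior samples
    q1dens = gaussian_kde(model_induced)
    q2dens = gaussian_kde(user_prior)
    pts = model_induced
    qtil = (q1dens.evaluate(pts) ** (1 - alpha)) * q2dens.evaluate(pts) ** alpha
    return qtil / sum(qtil)  # normalized pooled weight of each sample
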
def model(r, p0, n=1):
    """
    model(r, p0, n=1)
    Simulates the population dynamics model (PDM) P(t+1) = r * P(t),
    starting from the initial population size p0, for n time steps.
    Example model for testing purposes.
    """
    Pt = zeros(n, float)  # initialize the output vector
    P = p0
    for i in xrange(n):
        Pt[i] = r * P
        P = Pt[i]

    return Pt

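# Illustrative sketch (not part of the original module): the resampling loops in
# Meld.sir and Meld.abcRun retain draws from q1theta with probability
# proportional to their weight w[i] (rejection against max(w)). The helper below
# shows the same scheme, returning the retained indices; for example,
# _resample_sketch([0.1, 0.5, 0.4], 10) returns mostly positions 1 and 2.
def _resample_sketch(w, L):
    w = array(w, float)
    kept = []
    while len(kept) < L:
        i = randint(0, w.size)         # random candidate position
        if random() * max(w) <= w[i]:  # accept with probability w[i]/max(w)
            kept.append(i)
    return kept
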
def plotRaHist(arr):
    '''
    Plots a record array as a panel of histograms.
    '''
    nv = len(arr.dtype.names)
    fs = (numpy.ceil(numpy.sqrt(nv)), numpy.floor(numpy.sqrt(nv)) + 1)  # figure size
    P.figure()
    for i, n in enumerate(arr.dtype.names):
        P.subplot(nv / 2 + 1, 2, i + 1)
        P.hist(arr[n], bins=50, normed=1, label=n)
        P.legend()

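# Illustrative sketch (not part of the original module): the same toy problem as
# main2() below, fitted through abcRun instead of sir. The prior choices and the
# single observed value are arbitrary examples.
def main_abc():
    Me = Meld(K=5000, L=1000, model=model, ntheta=2, nphi=1, verbose=False, viz=False)
    Me.setTheta(['r', 'p0'], [stats.uniform, stats.uniform], [(2, 4), (0, 5)])
    Me.setPhi(['p'], [stats.uniform], [(6, 9)], [(6, 9)])
    if Me.abcRun(data={'p': array([7.5])}, t=1):  # basicfit expects arrays for observed variables
        pt, pp = Me.getPosteriors()
        plotRaHist(pt)
        plotRaHist(pp)
        P.show()
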
def main2():
    start = time()
    Me = Meld(K=10000, L=2000, model=model, ntheta=2, nphi=1, verbose=False, viz=False)
    Me.setTheta(['r', 'p0'], [stats.uniform, stats.uniform], [(2, 4), (0, 5)])
    Me.setPhi(['p'], [stats.uniform], [(6, 9)], [(6, 9)])
    #Me.addData(normal(7.5,1,400),'normal',(6,9))
    #Me.run()
    Me.sir(data={'p': [7.5]})
    pt, pp = Me.getPosteriors()
    end = time()
    plotRaHist(pt)
    plotRaHist(pp)
    P.show()
    print end - start, ' seconds'


if Viz:
    dtplot = RTplot(); phiplot = RTplot(); thplot = RTplot()

if __name__ == '__main__':
    main2()