Package BIP :: Package Bayes :: Module Melding
[hide private]
[frames | no frames]

Source Code for Module BIP.Bayes.Melding

  1  # -*- coding:utf-8 -*- 
  2  #----------------------------------------------------------------------------- 
  3  # Name:        Melding.py 
  4  # Purpose:     The Bayesian melding Class provides 
  5  #              uncertainty analyses for simulation models. 
  6  # 
  7  # Author:      Flávio Codeço Coelho 
  8  # 
  9  # Created:     2003/08/10 
 10  # Copyright:   (c) 2003-2008 by the Author 
 11  # Licence:     GPL 
 12  #----------------------------------------------------------------------------- 
 13  from numpy.core.records import recarray 
 14  try: 
 15      import psyco 
 16      psyco.full() 
 17  except: 
 18      pass 
 19  import sys 
 20  import os 
 21  import cPickle as CP 
 22  import like 
 23  import pylab as P 
 24  from scipy.stats.kde import gaussian_kde 
 25  from scipy.linalg import LinAlgError 
 26  from scipy import stats 
 27  import numpy 
 28  from numpy import array, nan_to_num, zeros, product, exp, ones,mean, var 
 29  from time import time 
 30  from numpy.random import normal, randint,  random, seed 
 31  try: 
 32      from BIP.Viz.realtime import RTplot 
 33      Viz=True 
 34  except: 
 35      Viz=False 
 36      print r"""Please install Gnuplot-py to enable realtime visualization. 
 37      http://gnuplot-py.sourceforge.net/ 
 38      """ 
 39  import lhs 
 40   
 41  from multiprocessing import Pool 
 42  if Viz: 
 43      dtplot = RTplot();phiplot = RTplot();thplot = RTplot() 
 44   
 45  __docformat__ = "restructuredtext en" 
 46   
 47   
class Meld:
    """
    Bayesian Melding class.

    Stores the prior and posterior samples of the model inputs (theta) and
    outputs (phi) as record arrays, together with the settings used by the
    fitting methods.
    """

    def __init__(self, K, L, model, ntheta, nphi, alpha = 0.5, verbose = False, viz=False ):
        """
        Initializes the Melding class.

        :Parameters:
            - `K`: Number of replicates of the model run. Also determines the prior sample size.
            - `L`: Number of samples from the Posterior distributions. Usually 10% of K.
            - `model`: Callable taking theta as argument and returning phi = M(theta).
            - `ntheta`: Number of inputs to the model (parameters).
            - `nphi`: Number of outputs of the model (State-variables)
            - `alpha`: Pooling weight of the user-provided priors on phi.
            - `verbose`: Boolean: whether to show more information about the computations
            - `viz`: Boolean. Whether to show graphical outputs of the fitting process.
              Ignored (forced to False) when the module-level `Viz` flag is False.
        """
        theta_formats = ['f8'] * ntheta
        phi_formats = ['f8'] * nphi

        # Run configuration
        self.K = K
        self.L = L
        self.model = model
        self.ntheta = ntheta
        self.nphi = nphi
        self.alpha = alpha  # pooling weight of user-provided phi priors
        self.verbose = verbose

        # Bookkeeping
        self.likelist = []  # list of likelihoods
        self.q2type = []  # list of distribution types
        self.done_running = False

        # Record arrays; recarray() allocates without initialising the contents.
        self.q1theta = recarray(K, formats=theta_formats)  # Theta priors
        self.post_theta = recarray(L, formats=theta_formats)  # Theta posteriors
        self.q2phi = recarray(K, formats=phi_formats)  # Phi priors
        self.phi = recarray(K, formats=phi_formats)  # Phi model-induced priors
        self.post_phi = recarray(L, formats=phi_formats)  # Phi posteriors

        # Real-time plots are only possible when the plotting backend was
        # imported successfully at module load time.
        self.viz = viz if Viz else False
        # NOTE: no process pool is stored on the instance (`self.po` is not set).
86 - def setPhi(self, names, dists=[stats.norm], pars=[(0, 1)], limits=[(-5,5)]):
87 """ 88 Setup the models Outputs, or Phi, and generate the samples from prior distributions 89 needed for the melding replicates. 90 91 :Parameters: 92 - `names`: list of string with the names of the variables. 93 - `dists`: is a list of RNG from scipy.stats 94 - `pars`: is a list of tuples of variables for each prior distribution, respectively. 95 - `limits`: lower and upper limits on the support of variables. 96 """ 97 if len(names) != self.nphi: 98 raise ValueError("Number of names(%s) does not match the number of output variables(%s)."%(len(names),self.nphi)) 99 self.q2phi.dtype.names = names 100 self.phi.dtype.names = names 101 self.post_phi.dtype.names = names 102 self.plimits = limits 103 for n,d,p in zip(names,dists,pars): 104 self.q2phi[n] = lhs.lhs(d,p,self.K).ravel() 105 self.q2type.append(d.name)
106 107 108
109 - def setTheta(self, names, dists=[stats.norm], pars=[(0, 1)]):
110 """ 111 Setup the models inputs and generate the samples from prior distributions 112 needed for the dists the melding replicates. 113 114 :Parameters: 115 - `names`: list of string with the names of the parameters. 116 - `dists`: is a list of RNG from scipy.stats 117 - `pars`: is a list of tuples of parameters for each prior distribution, respectivelydists 118 """ 119 self.q1theta.dtype.names = names 120 self.post_theta.dtype.names = names 121 if os.path.exists('q1theta'): 122 self.q1theta = CP.load(open('q1theta','r')) 123 else: 124 for n,d,p in zip(names,dists,pars): 125 self.q1theta[n] = lhs.lhs(d,p,self.K).ravel()
126
127 - def setThetaFromData(self,names,data, limits):
128 """ 129 Setup the model inputs and set the prior distributions from the vectors 130 in data. 131 This method is to be used when the prior distributions are available in 132 the form of a sample from an empirical distribution such as a bayesian 133 posterior. 134 In order to expand the samples provided, K samples are generated from a 135 kernel density estimate of the original sample. 136 137 :Parameters: 138 - `names`: list of string with the names of the parameters. 139 - `data`: list of vectors. Samples of a proposed distribution 140 - `limits`: List of (min,max) tuples for each theta to make sure samples are not generated outside these limits. 141 """ 142 self.q1theta.dtype.names = names 143 self.post_theta.dtype.names = names 144 if os.path.exists('q1theta'): 145 self.q1theta = CP.load(open('q1theta','r')) 146 else: 147 i = 0 148 for n,d in zip(names,data): 149 smp = [] 150 while len(smp)<self.K: 151 smp += [x for x in gaussian_kde(d).resample(self.K)[0] if x >= limits[i][0] and x <= limits[i][1]] 152 #print self.q1theta[n].shape, array(smp[:self.K]).shape 153 self.q1theta[n] = array(smp[:self.K]) 154 i += 1
155 # 156
157 - def setPhiFromData(self,names,data,limits):
158 """ 159 Setup the model outputs and set their prior distributions from the 160 vectors in data. 161 This method is to be used when the prior distributions are available in 162 the form of a sample from an empirical distribution such as a bayesian 163 posterior. 164 In order to expand the samples provided, K samples are generated from a 165 kernel density estimate of the original sample. 166 167 :Parameters: 168 - `names`: list of string with the names of the variables. 169 - `data`: list of vectors. Samples of the proposed distribution. 170 - `limits`: list of tuples (ll,ul),lower and upper limits on the support of variables. 171 """ 172 self.q2phi.dtype.names = names 173 self.phi.dtype.names = names 174 self.post_phi.dtype.names = names 175 self.limits = limits 176 for n,d in zip(names,data): 177 i = 0 178 smp = [] 179 while len(smp)<self.K: 180 try: 181 smp += [x for x in gaussian_kde(d).resample(self.K)[0] if x >= limits[i][0] and x <= limits[i][1]] 182 except: 183 #d is has no variation, i.e., all elements are the same 184 #print d 185 #raise LinAlgError, "Singular matrix" 186 smp = ones(self.K)*d[0] #in this case return a constant sample 187 self.q2phi[n] = array(smp[:self.K]) 188 self.q2type.append('empirical') 189 i += 1
190 #self.q2phi = self.filtM(self.q2phi, self.q2phi, limits) 191 192
193 - def run(self,*args):
194 """ 195 Runs the model through the Melding inference.model 196 model is a callable which return the output of the deterministic model, 197 i.e. the model itself. 198 The model is run self.K times to obtain phi = M(theta). 199 """ 200 201 for i in xrange(self.K): 202 theta = [self.q1theta[n][i] for n in self.q1theta.dtype.names] 203 r = self.po.apply_async(self.model, theta) 204 self.phi[i]= r.get()[-1]#self.model(*theta)[-1] #phi is the last point in the simulation 205 206 self.done_running = True
207
208 - def getPosteriors(self,t):
209 """ 210 Updates the posteriors of the model's output for the last t time steps. 211 Returns two record arrays: 212 - The posteriors of the Theta 213 - the posterior of Phi last t values of time-series. self.L by `t` arrays. 214 215 :Parameters: 216 - `t`: length of the posterior time-series to return. 217 """ 218 if not self.done_running: 219 return 220 if t > 1: 221 self.post_phi = recarray((self.L,t),formats=['f8']*self.nphi) 222 self.post_phi.dtype.names = self.phi.dtype.names 223 def cb(r): 224 ''' 225 callback function for the asynchronous model runs. 226 r: tuple with results of simulation (results, run#) 227 ''' 228 if t == 1: 229 self.post_phi[r[1]] = (r[0][-1],) 230 #self.post_phi[r[1]]= [tuple(l) for l in r[0][-t:]] 231 else: 232 self.post_phi[r[1]]= [tuple(l) for l in r[0][-t:]]
233 po = Pool() 234 #random indices for the marginal posteriors of theta 235 pti = lhs.lhs(stats.randint,(0,self.L),siz=(self.ntheta,self.L)) 236 for i in xrange(self.L):#Monte Carlo with values of the posterior of Theta 237 theta = [self.post_theta[n][pti[j,i]] for j,n in enumerate(self.post_theta.dtype.names)] 238 po.apply_async(enumRun, (self.model,theta,i), callback=cb) 239 # r = po.apply_async(self.model,theta) 240 # if t == 1: 241 # self.post_phi[i] = r.get()[-1] 242 # else: 243 # self.post_phi[i]= [tuple(l) for l in r.get()[-t:]] 244 if i%100 == 0 and self.verbose: 245 print "==> L = %s"%i 246 247 po.close() 248 po.join() 249 return self.post_theta, self.post_phi
250
251 - def filtM(self,cond,x,limits):
252 ''' 253 Multiple condition filtering. 254 Remove values in x[i], if corresponding values in 255 cond[i] are less than limits[i][0] or greater than 256 limits[i][1]. 257 258 :Parameters: 259 - `cond`: is an array of conditions. 260 - `limits`: is a list of tuples (ll,ul) with length equal to number of lines in `cond` and `x`. 261 - `x`: array to be filtered. 262 ''' 263 # Deconstruct the record array, if necessary. 264 names = [] 265 if isinstance(cond, recarray): 266 names = list(cond.dtype.names) 267 cond = [cond[v] for v in cond.dtype.names] 268 x = [x[v] for v in x.dtype.names] 269 270 cond = array(cond) 271 cnd = ones(cond.shape[1],int) 272 for i,j in zip(cond,limits): 273 ll = j[0] 274 ul = j[1] 275 #print cond.shape,cnd.shape,i.shape,ll,ul 276 cnd = cnd & less(i,ul) & greater(i,ll) 277 f = compress(cnd,x, axis=1) 278 279 if names:#Reconstruct the record array 280 r = recarray((1,f.shape[1]),formats=['f8']*len(names),names=names) 281 for i,n in enumerate(names): 282 r[n]=f[i] 283 f=r 284 285 return f
286
287 - def basicfit(self,s1,s2):
288 ''' 289 Calculates a basic fitness calculation between a model- 290 generated time series and a observed time series. 291 it uses a normalized RMS variation. 292 293 :Parameters: 294 - `s1`: model-generated time series. record array. 295 - `s2`: observed time series. dictionary with keys matching names of s1 296 297 :Return: 298 Root mean square deviation between ´s1´ and ´s2´. 299 ''' 300 fit = [] 301 for k in s2.keys(): 302 if s2[k] == [] or (not s2[k].any()): 303 continue #no observations for this variable 304 e = numpy.sqrt(mean((s1[k]-s2[k])**2.)) 305 fit.append(e) #min to guarantee error is bounded to (0,1) 306 307 return mean(fit) #mean r-squared
308 309
310 - def logPooling(self,phi):
311 """ 312 Returns the probability associated with each phi[i] 313 on the pooled pdf of phi and q2phi. 314 315 :Parameters: 316 - `phi`: prior of Phi induced by the model and q1theta. 317 """ 318 319 # Estimating the multivariate joint probability densities 320 phidens = gaussian_kde(array([phi[n][:,-1] for n in phi.dtype.names])) 321 322 q2dens = gaussian_kde(array([self.q2phi[n] for n in self.q2phi.dtype.names])) 323 # Determining the pooled probabilities for each phi[i] 324 # qtilphi = zeros(self.K) 325 lastp = array([list(phi[i,-1]) for i in xrange(self.K)]) 326 # print lastp,lastp.shape 327 qtilphi = (phidens.evaluate(lastp.T)**(1-self.alpha))*q2dens.evaluate(lastp.T)**self.alpha 328 return qtilphi/sum(qtilphi)
329
330 - def abcRun(self,fitfun=None, data={}, t=1,nopool=False,savetemp=False):
331 """ 332 Runs the model for inference through Approximate Bayes Computation 333 techniques. This method should be used as an alternative to the sir. 334 335 :Parameters: 336 - `fitfun`: Callable which will return the goodness of fit of the model to data as a number between 0-1, with 1 meaning perfect fit 337 - `t`: number of time steps to retain at the end of the of the model run for fitting purposes. 338 - `data`: dict containing observed time series (lists of length t) of the state variables. This dict must have as many items the number of state variables, with labels matching variables names. Unorbserved variables must have an empty list as value. 339 - `savetemp`: Should temp results be saved. Useful for long runs. Alows for resuming the simulation from last sa 340 """ 341 seed() 342 if not fitfun: 343 fitfun = self.basicfit 344 if savetemp: 345 CP.dump(self.q1theta,open('q1theta','w')) 346 # Running the model ========================== 347 phi = self.runModel(savetemp,t) 348 349 print "==> Done Running the K replicates\n" 350 # Do Log Pooling 351 if nopool: 352 qtilphi = ones(self.K) 353 else: 354 t0 = time() 355 qtilphi = self.logPooling(phi) #vector with probability of each phi[i] belonging to qtilphi 356 print "==> Done Running the Log Pooling (took %s seconds)\n"%(time()-t0) 357 qtilphi = nan_to_num(qtilphi) 358 #print 'max(qtilphi): ', max(qtilphi) 359 if sum(qtilphi)==0: 360 print 'Pooled prior on ouputs is null, please check your priors, and try again.' 
361 return 0 362 # 363 # calculate weights 364 w = [fitfun(phi[i],data) for i in xrange(phi.shape[0])] 365 w /=sum(w) 366 w = 1-w 367 #print "w=",w, mean(w), var(w) 368 # print 369 # print 'qtilphi=',qtilphi 370 # Resampling Thetas 371 w = nan_to_num(w) 372 w = array(w)*qtilphi 373 w /=sum(w) 374 w = nan_to_num(w) 375 print 'max(w): %s\nmean(w): %s\nvar(w): %s'%(max(w), mean(w), var(w)) 376 # for n in phi.dtype.names: 377 # P.plot(mean(phi[n],axis=0),label=n) 378 # P.figure() 379 # P.plot(w,label='w') 380 # P.plot(qtilphi,label='qtilphi') 381 # P.title('Resampling vector(w) and pooled prior on Phi') 382 # P.legend() 383 if sum(w) == 0.0: 384 print 'Resampling weights are all zero, please check your model or data.' 385 return 0 386 t0 = time() 387 j = 0 388 while j < self.L: # Extract L samples from q1theta 389 i=randint(0,w.size)# Random position of w and q1theta 390 if random()<= w[i]: 391 self.post_theta[j] = self.q1theta[i]# retain the sample according with resampling prob. 392 j+=1 393 print "==> Done Resampling (L=%s) priors (took %s seconds)"%(self.L,(time()-t0)) 394 395 self.done_running = True 396 return 1
397
398 - def sir(self, data={}, t=1,variance=0.1, nopool=False,savetemp=False):
399 """ 400 Run the model output through the Sampling-Importance-Resampling algorithm. 401 Returns 1 if successful or 0 if not. 402 403 :Parameters: 404 - `data`: observed time series on the model's output 405 - `t`: length of the observed time series 406 - `variance`: variance of the Normal likelihood function 407 - `nopool`: True if no priors on the outputs are available. Leads to faster calculations 408 - `savetemp`: Boolean. create a temp file? 409 """ 410 seed() 411 phi = self.runModel(savetemp,t) 412 # Do Log Pooling 413 if nopool: 414 qtilphi = ones(self.K) 415 else: 416 t0 = time() 417 qtilphi = self.logPooling(phi) #vector with probability of each phi[i] belonging to qtilphi 418 print "==> Done Running the Log Pooling (took %s seconds)\n"%(time()-t0) 419 qtilphi = nan_to_num(qtilphi) 420 print 'max(qtilphi): ', max(qtilphi) 421 if sum(qtilphi)==0: 422 print 'Pooled prior on ouputs is null, please check your priors, and try again.' 423 return 0 424 425 # Calculating the likelihood of each phi[i] considering the observed data 426 lik = zeros(self.K) 427 t0=time() 428 # po = Pool() 429 for i in xrange(self.K): 430 l=1 431 for n in data.keys(): 432 if isinstance(data[n],list) and data[n] == []: 433 continue #no observations for this variable 434 elif isinstance(data[n],numpy.ndarray) and (not data[n].any()): 435 continue #no observations for this variable 436 p = phi[n] 437 438 # liklist=[po.apply_async(like.Normal,(data[n][m], j, tau)) for m,j in enumerate(p[i])] 439 # l=product([p.get() for p in liklist]) 440 l *= product([exp(like.Normal(data[n][m], j,1./(variance))) for m,j in enumerate(p[i])]) 441 #l += sum([like.Normal(data[n][m], j,1./(tau*j+.0001)) for m,j in enumerate(p[i])]) 442 443 lik[i]=l 444 # po.close() 445 # po.join() 446 if self.viz: 447 dtplot.clearFig();phiplot.clearFig();thplot.clearFig() 448 dtplot.gp.xlabel('observed') 449 dtplot.gp.ylabel('simulated') 450 obs = [];sim =[] 451 for n in data.keys(): 452 obs.append(data[n]) 453 
sim.append(phi[n].mean(axis=0).tolist()) 454 dtplot.scatter(array(obs),array(sim),names=data.keys(),title='fit') 455 phiplot.plotlines(array(sim),names=data.keys(),title='Model Output') 456 thplot.plothist(self.q1theta, title='Input parameters',names=self.q1theta.dtype.names) 457 print "==> Done Calculating Likelihoods (took %s seconds)"%(time()-t0) 458 lr = nan_to_num(max(lik)/min(lik)) 459 print '==> Likelihood (min,mean,max,sum): ',min(lik),mean(lik),max(lik), sum(lik) 460 print "==> Likelihood ratio of best run/worst run: %s"%(lr,) 461 # Calculating the weights 462 w = nan_to_num(qtilphi*lik) 463 w = nan_to_num(w/sum(w)) 464 465 if not sum(w) == 0.0: 466 j = 0 467 t0 = time() 468 maxw = 0;minw = max(w) #keep track of goodness of fit of phi 469 while j < self.L: # Extract L samples from q1theta 470 i=randint(0,w.size)# Random position of w and q1theta 471 if random()*max(w)<= w[i]: 472 self.post_theta[j] = self.q1theta[i]# retain the sample according with resampling prob. 473 maxw = max(maxw,w[i]) 474 minw = min(minw,w[i]) 475 j+=1 476 if not j%100 and self.verbose: 477 print j, "of %s"%self.L 478 self.done_running = True 479 print "==> Done Resampling (L=%s) priors (took %s seconds)"%(self.L,(time()-t0)) 480 wr = maxw/minw 481 print "==> Likelihood ratio of best/worst retained runs: %s"%(wr,) 482 if wr == 1: 483 print "==> Flat likelihood, trying again..." 484 return 0 485 print "==> Improvement: %s percent"%(100-100*wr/lr,) 486 else: 487 print 'Resampling weights are all zero, please check your model or data, and try again.\n' 488 print '==> Likelihood (min,mean,max): ',min(lik),mean(lik),max(lik) 489 print '==> RMS deviation of outputs: %s'%(self.basicfit(phi, data),) 490 return 0 491 return 1
492
493 - def runModel(self,savetemp,t=1):
494 ''' 495 Handles running the model self.K times keeping a temporary savefile for 496 resuming calculation in case of interruption. 497 498 :Parameters: 499 - `savetemp`: Boolean. create a temp file? 500 ''' 501 if savetemp: 502 CP.dump(self.q1theta,open('q1theta','w')) 503 # Running the model ========================== 504 505 506 if os.path.exists('phi.temp'): 507 self.phi,j = CP.load(open('phi.temp','r')) 508 else: 509 j=0 510 self.phi = recarray((self.K,t),formats=['f8']*self.nphi, names = self.phi.dtype.names) 511 def cb(r): 512 ''' 513 callback function for the asynchronous model runs 514 ''' 515 if t == 1: 516 self.phi[r[1]] = (r[0][-1],) 517 else: 518 self.phi[r[1]] = [tuple(l) for l in r[0][-t:]]# #phi is the last t points in the simulation
519 520 po = Pool() 521 t0=time() 522 for i in xrange(j,self.K): 523 theta = [self.q1theta[n][i] for n in self.q1theta.dtype.names] 524 r = po.apply_async(enumRun,(self.model,theta,i),callback=cb) 525 # r = po.apply_async(self.model,theta) 526 # if t == 1: 527 # phi[i] = (r.get()[-1],) 528 # else: 529 # phi[i] = [tuple(l) for l in r.get()[-t:]]# #phi is the last t points in the simulation 530 if i%100 == 0 and self.verbose: 531 print "==> K = %s"%i 532 if savetemp: 533 CP.dump((self.phi,i),open('phi.temp','w')) 534 if savetemp: #If all replicates are done, clear temporary save files. 535 os.unlink('phi.temp') 536 os.unlink('q1theta') 537 po.close() 538 po.join() 539 print "==> Done Running the K (%s) replicates (took %s seconds)\n"%(self.K,(time()-t0)) 540 541 return self.phi
def enumRun(model,theta,k):
    """
    Runs the model once and tags its result with the run number, so that
    asynchronous callers can tell which run a result belongs to.

    :Parameters:
        - `model`: model callable
        - `theta`: model input list, unpacked into positional arguments
        - `k`: run number

    :Return:
        Tuple (result of `model(*theta)`, `k`).
    """
    return model(*theta), k
def model(r, p0, n=1):
    """
    Model (r,p0, n=1)
    Simulates the Population dynamic Model (PDM) Pt = rP0
    for n time steps.
    P0 is the initial population size.
    Example model for testing purposes.

    Returns a float array of length n with the population after each step.
    """
    Pt = zeros(n, float)  # output vector, one entry per time step
    for step in range(n):
        # Each step multiplies the previous population (p0 for the first step) by r.
        previous = Pt[step - 1] if step else p0
        Pt[step] = r*previous

    return Pt
def plotRaHist(arr):
    '''
    Plots a record array
    as a panel of histograms, one subplot per field, laid out in two columns.

    :Parameters:
        - `arr`: record array; each field is drawn as a 50-bin normalised histogram.
    '''
    names = arr.dtype.names
    # Floor division keeps the row count an integer even under true division.
    nrows = len(names)//2+1
    P.figure()
    for i,n in enumerate(names):
        P.subplot(nrows,2,i+1)
        # NOTE(review): `normed` was replaced by `density` in newer Matplotlib
        # releases; confirm the target version before changing it.
        P.hist(arr[n],bins=50, normed=1, label=n)
        P.legend()
def main2():
    """
    Example run: fits the demonstration `model` with the SIR algorithm,
    plots histograms of the posteriors and prints the elapsed time.
    """
    t_start = time()
    melder = Meld(K=5000,L=1000,model=model, ntheta=2,nphi=1,verbose=False,viz=False)
    # Uniform priors on the two inputs and on the single output.
    melder.setTheta(['r','p0'],[stats.uniform,stats.uniform],[(2,4),(0,5)])
    melder.setPhi(['p'],[stats.uniform],[(6,9)],[(6,9)])
    melder.sir(data ={'p':[7.5]} )
    theta_post,phi_post = melder.getPosteriors(1)
    t_end = time()
    for posterior in (theta_post, phi_post):
        plotRaHist(posterior)
    P.show()
    print("%s %s"%(t_end-t_start, ' seconds'))


if __name__ == '__main__':

    main2()