from numpy.core.records import recarray
try:
    import psyco
    psyco.full()
except ImportError:
    pass
import sys, os
import cPickle as CP
import like
import pylab as P
import scipy.stats.kde as kde
from scipy import stats
import numpy
from numpy import array, nan_to_num, zeros, product, exp, ones, mean
from numpy import arange, std, sqrt, less, greater, compress, var
from time import time
from numpy.random import normal, randint, random
from BIP.Viz.realtime import RTplot
import lhs

from multiprocessing import Pool

__docformat__ = "restructuredtext en"


class Meld(object):
    """
    Bayesian Melding class
    """
    def __init__(self, K, L, model, ntheta, nphi, alpha=0.5, verbose=False, viz=False):
        """
        Initializes the Melding class.

        :Parameters:
            - `K`: Number of replicates of the model run. Also determines the prior sample size.
            - `L`: Number of samples from the posterior distributions. Usually 10% of K.
            - `model`: Callable taking theta as argument and returning phi = M(theta).
            - `ntheta`: Number of inputs to the model (parameters).
            - `nphi`: Number of outputs of the model (state variables).
            - `alpha`: Weight of the output priors in the logarithmic pooling (0 <= alpha <= 1).
            - `verbose`: Boolean: whether to show more information about the computations.
            - `viz`: Boolean: whether to show graphical output of the fitting process.
        """
        self.K = K
        self.L = L
        self.verbose = verbose
        self.model = model
        self.likelist = []                                      # likelihood vectors added via addData
        self.q1theta = recarray(K, formats=['f8'] * ntheta)     # prior sample of the parameters
        self.post_theta = recarray(L, formats=['f8'] * ntheta)  # posterior sample of the parameters
        self.q2phi = recarray(K, formats=['f8'] * nphi)         # elicited prior sample of the outputs
        self.phi = recarray(K, formats=['f8'] * nphi)           # model-induced prior sample of the outputs
        self.q2type = []                                        # distribution types of the output priors
        self.post_phi = recarray(L, formats=['f8'] * nphi)      # posterior sample of the outputs
        self.ntheta = ntheta
        self.nphi = nphi
        self.alpha = alpha
        self.done_running = False
        self.viz = viz
        self.po = Pool()                                        # pool of processes for parallel model runs

    def setPhi(self, names, dists=[stats.norm], pars=[(0, 1)], limits=[(-5, 5)]):
        """
        Sets up the model's outputs (Phi) and generates the samples from the prior
        distributions needed for the melding replicates.

        :Parameters:
            - `names`: list of strings with the names of the variables.
            - `dists`: list of RNGs from scipy.stats.
            - `pars`: list of tuples of parameters for each prior distribution, respectively.
            - `limits`: list of (lower, upper) limits on the support of each variable.
        """
        if len(names) != self.nphi:
            raise ValueError("Number of names (%s) does not match the number of output variables (%s)." % (len(names), self.nphi))
        self.q2phi.dtype.names = names
        self.phi.dtype.names = names
        self.post_phi.dtype.names = names
        self.plimits = limits
        for n, d, p in zip(names, dists, pars):
            # Latin Hypercube sample of size K from each prior
            self.q2phi[n] = lhs.lhs(d, p, self.K).ravel()
            self.q2type.append(d.name)

    def setTheta(self, names, dists=[stats.norm], pars=[(0, 1)]):
        """
        Sets up the model's inputs and generates the samples from the prior
        distributions needed for the melding replicates.

        :Parameters:
            - `names`: list of strings with the names of the parameters.
            - `dists`: list of RNGs from scipy.stats.
            - `pars`: list of tuples of parameters for each prior distribution, respectively.
        """
        self.q1theta.dtype.names = names
        self.post_theta.dtype.names = names
        if os.path.exists('q1theta'):
            # resume from a previously saved prior sample
            self.q1theta = CP.load(open('q1theta', 'r'))
        else:
            for n, d, p in zip(names, dists, pars):
                self.q1theta[n] = lhs.lhs(d, p, self.K).ravel()

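    # A minimal usage sketch of the prior setup done by setTheta/setPhi above
    # (K, L and the parameter ranges are illustrative; see main2() at the bottom
    # of this module for a full run):
    #
    #   Me = Meld(K=5000, L=500, model=model, ntheta=2, nphi=1)
    #   Me.setTheta(['r', 'p0'], [stats.uniform, stats.uniform], [(2, 4), (0, 5)])
    #   Me.setPhi(['p'], [stats.uniform], [(6, 9)], [(6, 9)])
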
    def setThetaFromData(self, names, data, limits):
        """
        Sets up the model inputs and their prior distributions from the vectors
        in `data`.
        This method is to be used when the prior distributions are available in
        the form of a sample from an empirical distribution such as a Bayesian
        posterior.
        In order to expand the samples provided, K samples are generated from a
        kernel density estimate of the original sample.

        :Parameters:
            - `names`: list of strings with the names of the parameters.
            - `data`: list of vectors. Samples of a proposed distribution.
            - `limits`: list of (min, max) tuples for each theta, to make sure samples are not generated outside these limits.
        """
        self.q1theta.dtype.names = names
        self.post_theta.dtype.names = names
        if os.path.exists('q1theta'):
            self.q1theta = CP.load(open('q1theta', 'r'))
        else:
            i = 0
            for n, d in zip(names, data):
                smp = []
                # resample from a KDE of the data, keeping only values within the limits
                while len(smp) < self.K:
                    smp += [x for x in kde.gaussian_kde(d).resample(self.K)[0] if x >= limits[i][0] and x <= limits[i][1]]
                self.q1theta[n] = array(smp[:self.K])
                i += 1

    def setPhiFromData(self, names, data, limits):
        """
        Sets up the model outputs and their prior distributions from the
        vectors in `data`.
        This method is to be used when the prior distributions are available in
        the form of a sample from an empirical distribution such as a Bayesian
        posterior.
        In order to expand the samples provided, K samples are generated from a
        kernel density estimate of the original sample.

        :Parameters:
            - `names`: list of strings with the names of the variables.
            - `data`: list of vectors. Samples of the proposed distribution.
            - `limits`: list of (ll, ul) tuples, lower and upper limits on the support of each variable.
        """
        self.q2phi.dtype.names = names
        self.phi.dtype.names = names
        self.post_phi.dtype.names = names
        self.limits = limits
        i = 0
        for n, d in zip(names, data):
            smp = []
            # resample from a KDE of the data, keeping only values within the limits
            while len(smp) < self.K:
                smp += [x for x in kde.gaussian_kde(d).resample(self.K)[0] if x >= limits[i][0] and x <= limits[i][1]]
            self.q2phi[n] = array(smp[:self.K])
            self.q2type.append('empirical')
            i += 1

    def addData(self, data, model, limits, l=1024, **kwargs):
        """
        Calculates the likelihood function of the dataset presented and adds it to
        self.likelist.
        The likelihood function is a vector of length l.

        :Parameters:
            - `data`: vector containing observations on a given variable.
            - `model`: string with the name of the distribution of the variable.
            - `limits`: (ll, ul) tuple with lower and upper limits for the variable.
            - `l`: length (resolution) of the likelihood vector.
        """
        n = len(data)
        data = array(data)
        (ll, ul) = limits
        step = (ul - ll) / float(l)

        if model == 'normal':
            sd = std(data)
            prec = 1 / sd
            res = array([exp(like.Normal(data, mu, prec)) for mu in arange(ll, ul, step)])
            lik = res / max(res)
            print max(lik), min(lik)
        elif model == 'exponential':
            res = [lamb ** n * exp(-lamb * sum(data)) for lamb in arange(ll, ul, step)]
            lik = array(res) / max(array(res))
        elif model == 'beta':
            res = [exp(like.Beta(data, *kwargs['pars'])) for i in arange(ll, ul, step)]
            lik = array(res) / max(array(res))
        elif model == 'bernoulli':
            if ll < 0 or ul > 1:
                print "Parameter p of the bernoulli is out of range [0,1]"
            res = [exp(like.Bernoulli(data, p)) for p in arange(ll, ul, step)]
            lik = array(res) / max(array(res))
        elif model == 'poisson':
            res = [exp(like.Poisson(data, lb)) for lb in arange(ll, ul, step)]
            lik = array(res) / max(array(res))
        elif model == 'lognormal':
            sd = std(data)
            prec = 1 / sd
            res = [exp(like.Lognormal(data, mu, prec)) for mu in arange(ll, ul, step)]
            lik = array(res) / max(array(res))
        else:
            print 'Invalid distribution type. Valid distributions: normal, lognormal, exponential, bernoulli and poisson'
            return None
        self.likelist.append(lik)
        return lik

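    # A minimal sketch of addData (the observations and limits are illustrative):
    #
    #   obs = [7.1, 7.8, 7.4]
    #   lik = Me.addData(obs, 'normal', limits=(6, 9), l=512)
    #   # lik is a vector of 512 likelihood values, normalized so that max(lik) == 1
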
    def run(self, *args):
        """
        Runs the model through the Melding inference.
        self.model is a callable which returns the output of the deterministic model,
        i.e. the model itself.
        The model is run self.K times to obtain phi = M(theta).
        """
        for i in xrange(self.K):
            theta = [self.q1theta[n][i] for n in self.q1theta.dtype.names]
            r = self.po.apply_async(self.model, theta)
            # keep only the last point of the simulated series
            self.phi[i] = r.get()[-1]

        self.done_running = True

    def getPosteriors(self, t=1):
        """
        Updates the posteriors of the model's output for the last t time steps.
        Returns two record arrays:
            - the posteriors of Theta;
            - the posterior of Phi: the last t values of the time series, as self.L by `t` arrays.

        :Parameters:
            - `t`: length of the posterior time series to return.
        """
        if not self.done_running:
            return
        if t > 1:
            self.post_phi = recarray((self.L, t), formats=['f8'] * self.nphi)
            self.post_phi.dtype.names = self.phi.dtype.names

        def cb(r):
            '''
            Callback function for the asynchronous model runs.
            r: tuple with the results of the simulation (results, run #).
            '''
            if t == 1:
                self.post_phi[r[1]] = (r[0][-1],)
            else:
                self.post_phi[r[1]] = [tuple(l) for l in r[0][-t:]]

        po = Pool()
        # sample, with replacement, indices into the posterior sample of theta
        pti = lhs.lhs(stats.randint, (0, self.L), siz=(self.ntheta, self.L))
        for i in xrange(self.L):
            theta = [self.post_theta[n][pti[j, i]] for j, n in enumerate(self.post_theta.dtype.names)]
            po.apply_async(enumRun, (self.model, theta, i), callback=cb)
            if i % 100 == 0 and self.verbose:
                print "==> L = %s" % i

        po.close()
        po.join()
        return self.post_theta, self.post_phi

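    # A minimal sketch of getPosteriors (assuming sir() or abcRun() has already
    # been called, so done_running is True):
    #
    #   post_theta, post_phi = Me.getPosteriors(t=1)
    #   # post_theta: record array with the L resampled parameter sets
    #   # post_phi:   record array with the corresponding model outputs (last t points)
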
    def filtM(self, cond, x, limits):
        '''
        Multiple-condition filtering.
        Removes the values in x[i] whose corresponding values in
        cond[i] are less than limits[i][0] or greater than
        limits[i][1].

        :Parameters:
            - `cond`: array of conditions.
            - `limits`: list of (ll, ul) tuples with length equal to the number of lines in `cond` and `x`.
            - `x`: array to be filtered.
        '''
        names = []
        if isinstance(cond, recarray):
            names = list(cond.dtype.names)
            cond = [cond[v] for v in cond.dtype.names]
            x = [x[v] for v in x.dtype.names]

        cond = array(cond)
        cnd = ones(cond.shape[1], int)
        for i, j in zip(cond, limits):
            ll = j[0]
            ul = j[1]
            # accumulate a boolean mask over all the conditions
            cnd = cnd & less(i, ul) & greater(i, ll)
        f = compress(cnd, x, axis=1)

        if names:
            r = recarray((1, f.shape[1]), formats=['f8'] * len(names), names=names)
            for i, n in enumerate(names):
                r[n] = f[i]
            f = r

        return f

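    # A minimal sketch of filtM (illustrative values): keep only the columns of x
    # whose corresponding values in cond fall strictly inside the given limits.
    #
    #   cond = array([[0.5, 1.5, 2.5]])
    #   x = array([[10., 20., 30.]])
    #   Me.filtM(cond, x, [(1, 3)])   # -> array([[20., 30.]])
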
    def basicfit(self, s1, s2):
        '''
        Calculates a basic fitness measure between a model-generated
        time series and an observed time series:
        the mean, over the observed variables, of the root-mean-square deviation.

        :Parameters:
            - `s1`: model-generated time series; record array.
            - `s2`: observed time series; dictionary with keys matching the names of s1.
        '''
        fit = []
        for k in s2.keys():
            if s2[k] == [] or (not s2[k].any()):
                # skip unobserved variables
                continue
            e = sqrt(mean((s1[k] - s2[k]) ** 2.))
            fit.append(e)

        return mean(fit)

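    # basicfit returns the mean, over the observed variables, of the RMS deviation
    # between the simulated and the observed series. A small worked example
    # (illustrative values):
    #
    #   s1 = {'p': array([7.0, 8.0])}   # simulated
    #   s2 = {'p': array([7.5, 7.5])}   # observed
    #   # sqrt(mean(([7.0, 8.0] - [7.5, 7.5])**2)) == 0.5
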
    def logPooling(self, phi):
        """
        Returns the probability associated with each phi[i]
        on the pooled pdf of phi and q2phi.

        :Parameters:
            - `phi`: prior of Phi induced by the model and q1theta.
        """
        # KDE of the model-induced prior (last point of each simulated series)
        phidens = stats.gaussian_kde(array([phi[n][:, -1] for n in phi.dtype.names]))
        # KDE of the elicited prior on the outputs
        q2dens = stats.gaussian_kde(array([self.q2phi[n] for n in self.q2phi.dtype.names]))

        lastp = array([list(phi[i, -1]) for i in xrange(self.K)])
        # logarithmic pooling: weighted geometric mean of the two densities
        qtilphi = (phidens.evaluate(lastp.T) ** (1 - self.alpha)) * q2dens.evaluate(lastp.T) ** self.alpha
        return qtilphi / sum(qtilphi)

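    # The pooled prior above is a logarithmic pool: qtilde(phi) is proportional to
    # q1(phi)**(1 - alpha) * q2(phi)**alpha, where q1 is the kernel density estimate
    # of the model-induced prior of the outputs, q2 is the prior set with
    # setPhi/setPhiFromData, and alpha is the pooling weight given at construction.
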
    def abcRun(self, fitfun=None, data={}, t=1, savetemp=False):
        """
        Runs the model for inference through Approximate Bayesian Computation
        techniques. This method should be used as an alternative to the sir.

        :Parameters:
            - `fitfun`: callable which will return the goodness of fit of the model to the data as a number between 0 and 1, with 1 meaning a perfect fit.
            - `t`: number of time steps to retain at the end of the model run for fitting purposes.
            - `data`: dict containing the observed time series (lists of length t) of the state variables. This dict must have as many items as the number of state variables, with labels matching the variable names. Unobserved variables must have an empty list as value.
            - `savetemp`: should temp results be saved? Useful for long runs. Allows resuming the simulation from the last saved state.
        """
        if not fitfun:
            fitfun = self.basicfit
        if savetemp:
            CP.dump(self.q1theta, open('q1theta', 'w'))

        # run the model self.K times, resuming from a temp file if one is present
        if os.path.exists('phi.temp'):
            phi, j = CP.load(open('phi.temp', 'r'))
        else:
            j = 0
            phi = recarray((self.K, t), formats=['f8'] * self.nphi, names=self.phi.dtype.names)
        for i in xrange(j, self.K):
            theta = [self.q1theta[n][i] for n in self.q1theta.dtype.names]
            r = self.po.apply_async(self.model, theta)
            phi[i] = [tuple(l) for l in r.get()[-t:]]
            if i % 100 == 0:
                print "==> K = %s" % i
                if savetemp:
                    CP.dump((phi, i), open('phi.temp', 'w'))
        if savetemp:
            os.unlink('phi.temp')
            os.unlink('q1theta')

        print "==> Done Running the K replicates\n"
        qtilphi = self.logPooling(phi)
        qtilphi = nan_to_num(qtilphi)
        print 'max(qtilphi): ', max(qtilphi)

        # fitness-based weights: smaller distance means larger weight
        w = array([fitfun(phi[i], data) for i in xrange(phi.shape[0])])
        w /= sum(w)
        w = 1 - w
        print "w=", w, mean(w), var(w)
        print
        print 'qtilphi=', qtilphi

        w = nan_to_num(w)
        w = array(w) * qtilphi
        w /= sum(w)
        w = nan_to_num(w)
        print 'max(w): ', max(w)

        if sum(w) == 0.0:
            sys.exit('Resampling weights are all zero, please check your model or data.')
        # resample the parameter priors with probability proportional to w
        j = 0
        while j < self.L:
            i = randint(0, w.size)
            if random() <= w[i]:
                self.post_theta[j] = self.q1theta[i]
                j += 1

        self.done_running = True

    def sir(self, data={}, t=1, tau=0.1, nopool=False, savetemp=False):
        """
        Runs the model output through the Sampling-Importance-Resampling algorithm.
        Returns 1 if successful or 0 if not.

        :Parameters:
            - `data`: observed time series on the model's output.
            - `t`: length of the observed time series.
            - `tau`: precision of the Normal likelihood function.
            - `nopool`: True if no priors on the outputs are available. Leads to faster calculations.
            - `savetemp`: Boolean. Create a temp file?
        """
        phi = self.runModel(savetemp, t)

        # Pooled prior on the outputs
        if nopool:
            qtilphi = ones(self.K)
        else:
            t0 = time()
            qtilphi = self.logPooling(phi)
            print "==> Done Running the Log Pooling (took %s seconds)\n" % (time() - t0)
            qtilphi = nan_to_num(qtilphi)
            print 'max(qtilphi): ', max(qtilphi)
            if sum(qtilphi) == 0:
                print 'Pooled prior on outputs is null, please check your priors, and try again.'
                return 0

        # Likelihood of each of the K runs given the data
        lik = zeros(self.K)
        t0 = time()
        for i in xrange(self.K):
            l = 1
            for n in data.keys():
                if isinstance(data[n], list) and data[n] == []:
                    continue  # no observations for this variable
                elif isinstance(data[n], numpy.ndarray) and (not data[n].any()):
                    continue
                p = phi[n]
                # Normal likelihood of the observed series given the simulated one
                l *= product([exp(like.Normal(data[n][m], j, tau)) for m, j in enumerate(p[i])])
            lik[i] = l

        if self.viz:
            fitplot = RTplot()
            fitplot.plotlines(data, style='points')
            fitplot.plotlines(phi, style='points')
        print "==> Done Calculating Likelihoods (took %s seconds)" % (time() - t0)
        lr = nan_to_num(max(lik) / min(lik))
        print "==> Likelihood ratio of best run/worst run: %s" % (lr,)

        # Resampling weights: pooled prior times likelihood
        w = nan_to_num(qtilphi * lik)
        w = nan_to_num(w / sum(w))

        if not sum(w) == 0.0:
            j = 0
            t0 = time()
            maxw = 0; minw = max(w)
            # rejection sampling of self.L parameter sets with probability proportional to w
            while j < self.L:
                i = randint(0, w.size)
                if random() * max(w) <= w[i]:
                    self.post_theta[j] = self.q1theta[i]
                    maxw = max(maxw, w[i])
                    minw = min(minw, w[i])
                    j += 1
                    if not j % 100 and self.verbose:
                        print j, "of %s" % self.L
            self.done_running = True
            print "==> Done Resampling (L=%s) priors (took %s seconds)" % (self.L, (time() - t0))
            wr = maxw / minw
            print "==> Likelihood ratio of best/worst retained runs: %s" % (wr,)
            if wr == 1:
                print "==> Flat likelihood, trying again..."
            print "==> Improvement: %s percent" % (100 - 100 * wr / lr,)
        else:
            print 'Resampling weights are all zero, please check your model or data, and try again.\n'
            print '==> Likelihood (min,mean,max): ', min(lik), mean(lik), max(lik)
            return 0
        return 1

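    # A minimal sketch of calling sir (grounded in main2() below; the observed
    # value is illustrative):
    #
    #   ok = Me.sir(data={'p': [7.5]}, t=1)
    #   # ok is 1 on success, 0 if the pooled prior or the resampling weights are all zero
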
    def runModel(self, savetemp, t=1):
        '''
        Handles running the model self.K times, keeping a temporary savefile for
        resuming the calculation in case of interruption.

        :Parameters:
            - `savetemp`: Boolean. Create a temp file?
            - `t`: number of time steps to retain from the end of each run.
        '''
        if savetemp:
            CP.dump(self.q1theta, open('q1theta', 'w'))

        # resume from a previous partial run if a temp file is present
        if os.path.exists('phi.temp'):
            phi, j = CP.load(open('phi.temp', 'r'))
        else:
            j = 0
            phi = recarray((self.K, t), formats=['f8'] * self.nphi, names=self.phi.dtype.names)

        def cb(r):
            '''
            Callback function for the asynchronous model runs.
            '''
            if t == 1:
                phi[r[1]] = (r[0][-1],)
            else:
                phi[r[1]] = [tuple(l) for l in r[0][-t:]]

        po = Pool()
        t0 = time()
        for i in xrange(j, self.K):
            theta = [self.q1theta[n][i] for n in self.q1theta.dtype.names]
            r = po.apply_async(enumRun, (self.model, theta, i), callback=cb)
            if i % 100 == 0 and self.verbose:
                print "==> K = %s" % i
                if savetemp:
                    CP.dump((phi, i), open('phi.temp', 'w'))
        if savetemp:
            os.unlink('phi.temp')
            os.unlink('q1theta')
        po.close()
        po.join()
        print "==> Done Running the K (%s) replicates (took %s seconds)\n" % (self.K, (time() - t0))

        return phi


def enumRun(model, theta, k):
    """
    Returns the model results plus the run number.

    :Parameters:
        - `model`: model callable
        - `theta`: model input list
        - `k`: run number

    :Return:
        - `res`: result list
        - `k`: run number
    """
    res = model(*theta)
    return (res, k)


def model(r, p0, n=1):
    """
    Simulates the population dynamics model (PDM) Pt = r * P0
    for n time steps.
    P0 is the initial population size.
    Example model for testing purposes.
    """
    Pt = zeros(n, float)
    P = p0
    for i in xrange(n):
        Pt[i] = r * P
        P = Pt[i]

    return Pt
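
# Note: with the recurrence above the population grows geometrically, so after n
# steps Pt[n-1] == r**n * p0 (e.g. r=3, p0=2.5, n=1 gives 7.5, the observation
# used in main2() below).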


def plotRaHist(arr):
    '''
    Plots a record array
    as a panel of histograms.
    '''
    nv = len(arr.dtype.names)
    fs = (numpy.ceil(numpy.sqrt(nv)), numpy.floor(numpy.sqrt(nv)) + 1)
    P.figure()
    for i, n in enumerate(arr.dtype.names):
        P.subplot(nv / 2 + 1, 2, i + 1)
        P.hist(arr[n], bins=50, normed=1, label=n)
        P.legend()


def main2():
    start = time()
    Me = Meld(K=10000, L=2000, model=model, ntheta=2, nphi=1, verbose=True, viz=True)
    Me.setTheta(['r', 'p0'], [stats.uniform, stats.uniform], [(2, 4), (0, 5)])
    Me.setPhi(['p'], [stats.uniform], [(6, 9)], [(6, 9)])
    Me.sir(data={'p': [7.5]})
    pt, pp = Me.getPosteriors()
    end = time()
    plotRaHist(pt)
    plotRaHist(pp)
    P.show()
    print end - start, ' seconds'


if __name__ == '__main__':
    main2()