Package PyDSTool :: Package Toolbox :: Module data_analysis
[hide private]
[frames] | no frames]

Source Code for Module PyDSTool.Toolbox.data_analysis

  1  """ 
  2  Data analysis utilities 
  3   
  4  Robert Clewley, August 2006 
  5  """ 
  6   
  7  from __future__ import division 
  8  from PyDSTool.Points import Point, Pointset 
  9  from PyDSTool.Interval import Interval 
 10  from PyDSTool.utils import intersect, filteredDict 
 11  from PyDSTool.errors import * 
 12  from PyDSTool.common import _num_types, _int_types, _seq_types, args, \ 
 13       sortedDictLists, sortedDictValues, sortedDictKeys 
 14  import numpy as npy 
 15  from numpy import array, dot, zeros, transpose, shape, extract, mean, std, \ 
 16       vstack, hstack, eye, ones, zeros, linalg, concatenate, \ 
 17       newaxis, r_, flipud, convolve, matrix, asarray 
 18  from numpy.linalg import norm 
 19  from scipy.signal import lfilter, butter 
 20  from scipy.optimize import minpack, optimize 
 21   
 22  try: 
 23      from mdp.nodes import PCANode 
 24  except: 
25 - def PCANode(*args, **kwargs):
26 raise ImportError("MDP must be installed for this functionality")
27 from PyDSTool.matplotlib_import import * 28 from copy import copy 29 30 31 _PCA_utils = ['doPCA', 'get_evec', 'pca_dim', 'get_residual', 32 'plot_PCA_residuals', 'plot_PCA_spectrum'] 33 34 _metric_utils = ['log_distances', 'log_distances_with_D'] 35 36 _analysis_utils = ['find_knees', 'whiten', 'find_closest_val', 'out_of_seq', 37 'find_nearby_ball', 'find_nearby_annulus', 'data_bins', 38 'find_central_point', 'find_diameter', 'find_from_sorted', 39 'dist_between_datasets', 'find_recurrences'] 40 41 _fit_utils = ['get_linear_regression_residual', 'fitline'] 42 43 _filter_utils = ['lfilter_zi', 'filtfilt', 'lfilter', 'butter', 'rectify'] 44 45 __all__ = _PCA_utils + _metric_utils + _analysis_utils + _fit_utils + \ 46 _filter_utils 47 48 # ------------------------------------------------------------------------- 49 50 # Filter utils (reproduced from SciPy Cookbook) 51
52 -def lfilter_zi(b,a):
53 #compute the zi state from the filter parameters. see [Gust96]. 54 55 #Based on: 56 # [Gust96] Fredrik Gustafsson, Determining the initial states in forward-backward 57 # filtering, IEEE Transactions on Signal Processing, pp. 988--992, April 1996, 58 # Volume 44, Issue 4 59 60 n=max(len(a),len(b)) 61 62 zin = (eye(n-1) - hstack( (-a[1:n,newaxis], 63 vstack((eye(n-2), zeros(n-2)))))) 64 65 zid= b[1:n] - a[1:n]*b[0] 66 67 zi_matrix=linalg.inv(zin)*(matrix(zid).transpose()) 68 zi_return=[] 69 70 #convert the result into a regular array (not a matrix) 71 for i in range(len(zi_matrix)): 72 zi_return.append(float(zi_matrix[i][0])) 73 74 return array(zi_return)
75 76 77 78
79 -def filtfilt(b,a,x):
80 #For now only accepting 1d arrays 81 ntaps=max(len(a),len(b)) 82 edge=ntaps*3 83 84 if x.ndim != 1: 85 raise ValueError("Filiflit is only accepting 1 dimension arrays.") 86 87 #x must be bigger than edge 88 if x.size < edge: 89 raise ValueError, \ 90 "Input vector needs to be bigger than 3 * max(len(a),len(b)." 91 92 if len(a) < ntaps: 93 a=r_[a,zeros(len(b)-len(a))] 94 95 if len(b) < ntaps: 96 b=r_[b,zeros(len(a)-len(b))] 97 98 zi=lfilter_zi(b,a) 99 100 #Grow the signal to have edges for stabilizing 101 #the filter with inverted replicas of the signal 102 s=r_[2*x[0]-x[edge:1:-1],x,2*x[-1]-x[-1:-edge:-1]] 103 #in the case of one go we only need one of the extremes 104 # both are needed for filtfilt 105 106 (y,zf)=lfilter(b,a,s,-1,zi*s[0]) 107 108 (y,zf)=lfilter(b,a,flipud(y),-1,zi*y[-1]) 109 110 return flipud(y[edge-1:-edge+1])
111 112 113 # Principal Component Analysis utilities 114
115 -def doPCA(in_pts, D, tol=.8):
116 pts=copy(in_pts) 117 pts.shape = (len(in_pts),D) 118 p=PCANode(output_dim=tol) 119 p.train(pts) 120 p.stop_training() 121 return p
122 123
124 -def get_evec(p, dim):
125 # extra call to array() to make it contiguous 126 v = array(transpose(p.v)[dim-1]) 127 v.shape = (totdim, 3) 128 return v
129 130
131 -def pca_dim(pca, covering, data, refpt, tol=0.8):
132 if type(refpt) != int: 133 refpt = refpt[0] 134 pca[refpt] = doPCA(nhd(data, covering, refpt), tol) 135 print "PCA eigenvalues for nhd #%i:"%refpt 136 print pca[refpt].d 137 return len(pca[refpt].d)
138 139
140 -def get_residual(pcanode, pts):
141 pvals = pcanode.execute(pts, pcanode.get_output_dim()) 142 pproj = mat(pvals)*mat(transpose(pcanode.v)) 143 res = pts-array(pproj) # orthogonal complement of projection onto PCs 144 resval_part = norm(res) 145 resval = norm(resval_part) 146 return resval
147 148
149 -def plot_PCA_residuals(data, D=None, newfig=True, marker='o', 150 silent=False):
151 if D is None: 152 D = shape(data)[1] 153 p=doPCA(data, D, D) 154 spec=zeros((D,1),float) 155 for i in range(1,D): 156 spec[i] = norm(p.d[:i]) 157 res = transpose(sqrt(spec[-1]**2-spec**2)/spec[-1])[0] 158 if not silent: 159 print "L2-norm of PCA spectrum =", spec[-1] 160 if newfig: 161 figure() 162 style='k'+marker 163 else: 164 style=marker 165 semilogy(res,style) 166 ## title('PCA residuals') 167 xlabel(r'$\rm{Dimension}$',fontsize=20) 168 ylabel(r'$\rm{PCA \ residual}$',fontsize=20) 169 return p, res
170 171
172 -def plot_PCA_spectrum(data, D):
173 p=doPCA(data, D, D) 174 spec=zeros((D,1),float) 175 for i in range(1,D): 176 spec[i] = norm(p.d[:i]) 177 figure();plot(spec,'ko') 178 title('L2-norm of spectrum of singular values') 179 xlabel('D') 180 ylabel('|s|') 181 return p
182 183 # ----------------------------------------------------------------------------- 184 185
186 -def log_distances(m, sampleix=0, doplot=True, quiet=True, logv=None, 187 plotstyle=None, return_ixs=False):
188 """Log distances (L2-norm) of data points from a reference point (sampleix). 189 190 For multiple calls to this function on the same data, it's faster to 191 pass logv precomputed. 192 193 return_ixs=True also returns the point indices from the sort to 194 be returned in this argument in a third return value. 195 """ 196 npts = size(m,0) 197 assert sampleix >= 0 198 d = zeros((npts-1,), 'd') 199 if rank(m) == 3: 200 for i in xrange(npts): 201 if sampleix != i: 202 try: 203 d[i] = norm(m[sampleix,:,:]-m[i,:,:]) 204 except IndexError: 205 # catch case when index is too large for npts-1 206 # so use the empty i=pix position (ordering doesn't matter) 207 d[sampleix] = norm(m[sampleix,:,:]-m[i,:,:]) 208 elif rank(m) == 2: 209 for i in xrange(npts): 210 if sampleix != i: 211 try: 212 d[i] = norm(m[sampleix,:]-m[i,:]) 213 except IndexError: 214 # catch case when index is too large for npts-1 215 # so use the empty i=pix position (ordering doesn't matter) 216 d[sampleix] = norm(m[sampleix,:]-m[i,:]) 217 else: 218 raise ValueError("Rank of input data must be 2 or 3") 219 if return_ixs: 220 # return sorted indices in that list argument 221 # (assumed to be an empty list) 222 ixs = array(argsort(d)) 223 d = d[ixs] 224 else: 225 # just sort 226 d.sort() 227 if not quiet: 228 print "Chose reference point %i"%sampleix 229 print "Min distance = %f, max distance = %f"%(d[0], d[-1]) 230 logd = log(d).ravel() 231 if logv is None: 232 logv = log(range(1,len(d)+1)) 233 nan_ixs = isfinite(logd) # mask, not a list of indices 234 logd = extract(nan_ixs, logd) 235 logv = extract(nan_ixs, logv) 236 if doplot: 237 if plotstyle is None: 238 plot(logd,logv) 239 else: 240 plot(logd,logv,plotstyle) 241 if return_ixs: 242 ixs = extract(nan_ixs, ixs) 243 return (logv, logd, ixs) 244 else: 245 return (logv, logd)
246 247 248 # used for log_distances_with_D 249 dict_tol = array([1e-8]) 250 251
252 -def log_distances_with_D(m, sampleix, logv=None, return_ixs=False):
253 """Log distances (L2-norm) of data points from a reference point (sampleix). 254 Additionally, it returns a list of the inter-point distances, 255 relative to the reference point, and the inverse of this mapping. 256 257 For multiple calls to this function on the same data, it's faster to 258 pass logv precomputed. 259 260 return_ixs=True returns the point indices from the sort in a fifth 261 return value. This identifies the points in the returned 262 list of inter-point distances and its inverse. 263 """ 264 npts = size(m,0) 265 dk = zeros((npts-1,), 'd') 266 d_inv = {} 267 # do in two stages: ix 0->sampleix, sampleix+1->npts-1 268 # but offset second stage indices in dk so there's a total of n-1 269 # this is basically a slightly unrolled loop to make it a little faster 270 if rank(m) == 3: 271 for i in xrange(sampleix): 272 dk[i] = norm(m[sampleix,:,:]-m[i,:,:]) 273 if dk[i] in d_inv: 274 # ensure we find a unique distance for dictionary key 275 # (doesn't matter if we adjust it at very fine resolution 276 # to become unique) 277 done = 0 278 u = dk[i] 279 sgn = 1 280 lim=40 281 while done < lim: 282 u = dk[i] + sgn*done*dict_tol 283 if u[0] not in d_inv: 284 done = lim+1 # avoids exception below 285 else: 286 if sgn == -1: 287 sgn = 1 288 else: 289 done += 1 290 sgn = -1 291 if done == lim: 292 raise ValueError("Non-unique distance found") 293 dk[i] = u 294 d_inv[u[0]] = i 295 else: 296 d_inv[dk[i]] = i 297 for i in xrange(sampleix+1, npts): 298 ki = i-1 299 dk[ki] = norm(m[sampleix,:,:]-m[i,:,:]) 300 if dk[ki] in d_inv: 301 # ensure we find a unique distance for dictionary key 302 # (doesn't matter if we adjust it at very fine resolution 303 # to become unique) 304 done = 0 305 u = dk[ki] 306 sgn = 1 307 lim=40 308 while done < lim: 309 u = dk[ki] + sgn*done*dict_tol 310 if u[0] not in d_inv: 311 done = lim+1 # avoids exception below 312 else: 313 if sgn == -1: 314 sgn = 1 315 else: 316 done += 1 317 sgn = -1 318 if done == lim: 319 raise ValueError("Non-unique distance found") 320 dk[ki] = u 321 d_inv[u[0]] = i 322 else: 323 d_inv[dk[ki]] = i 324 elif rank(m) == 2: 325 for i in xrange(sampleix): 326 dk[i] = norm(m[sampleix,:]-m[i,:]) 327 if dk[i] in d_inv: 328 # ensure we find a unique distance for dictionary key 329 # (doesn't matter if we adjust it at very fine resolution 330 # to become unique) 331 done = 0 332 u = dk[i] 333 sgn = 1 334 lim=40 335 while done < lim: 336 u = dk[i] + sgn*done*dict_tol 337 if u[0] not in d_inv: 338 done = lim+1 # avoids exception below 339 else: 340 if sgn == -1: 341 sgn = 1 342 else: 343 done += 1 344 sgn = -1 345 if done == lim: 346 raise ValueError("Non-unique distance found") 347 dk[i] = u 348 d_inv[u[0]] = i 349 else: 350 d_inv[dk[i]] = i 351 for i in xrange(sampleix+1, npts): 352 ki = i-1 353 dk[ki] = norm(m[sampleix,:]-m[i,:]) 354 if dk[ki] in d_inv: 355 # ensure we find a unique distance for dictionary key 356 # (doesn't matter if we adjust it at very fine resolution 357 # to become unique) 358 done = 0 359 u = dk[ki] 360 sgn = 1 361 lim=40 362 while done < lim: 363 u = dk[ki] + sgn*done*dict_tol 364 if u[0] not in d_inv: 365 done = lim+1 # avoids exception below 366 else: 367 if sgn == -1: 368 sgn = 1 369 else: 370 done += 1 371 sgn = -1 372 if done == lim: 373 raise ValueError("Non-unique distance found") 374 dk[ki] = u 375 d_inv[u[0]] = i 376 else: 377 d_inv[dk[ki]] = i 378 379 else: 380 raise ValueError("Rank of input data must be 2 or 3") 381 d = copy(dk) # dk remains unsorted 382 if return_ixs: 383 # return sorted indices in that list argument 384 # (assumed to be an empty list) 385 ixs = array(argsort(d)) 386 d = d[ixs] 387 else: 388 # just sort 389 d.sort() 390 logd = log(d).ravel() 391 if logv is None: 392 logv = log(range(1,len(d)+1)) 393 nan_ixs = isfinite(logd) # mask, not a list of indices 394 logd = extract(nan_ixs, logd) 395 logv = extract(nan_ixs, logv) 396 d = extract(nan_ixs, d) 397 ## for i in range(len(d)): 398 ## if nan_ixs[i] == 0: 399 ## try: 400 ## del d_inv[dk[i][0]] 401 ## print "Deleted key %i @ %f"%(i,dk[i][0]) 402 ## except KeyError: 403 ## print "Failed to delete key %i @ %f"%(i,dk[i][0]) 404 #### raise 405 if return_ixs: 406 ixs = extract(nan_ixs, ixs) 407 return (logv, logd, d, d_inv, ixs) 408 else: 409 return (logv, logd, d, d_inv)
410 411 # ---------------------------------------------------------------------------- 412
413 -def rectify(xa, full=False):
414 """Half-wave (default) or full-wave signal rectification: specify using 415 optional full Boolean flag. 416 xa must be an array""" 417 if full: 418 pos = npy.asarray(xa>0,int) 419 return pos*xa - (1-pos)*xa 420 else: 421 return npy.asarray(xa>0,int)*xa
422 423 424
425 -def get_linear_regression_residual(pfit, x, y, weight='', w=0):
426 """Also see scipy.stat.linregress for a more sophisticated version 427 """ 428 res = 0 429 if weight == 'lo': 430 for i in range(len(x)): 431 res += ((y[i]-(pfit[0]*x[i]+pfit[1]))/(i+1)/w)**2 432 elif weight == 'hi': 433 l = len(x) 434 for i in range(l): 435 res += ((y[i]-(pfit[0]*x[i]+pfit[1]))/(l-i+1)/w)**2 436 else: 437 for i in range(len(x)): 438 res += (y[i]-(pfit[0]*x[i]+pfit[1]))**2 439 return sqrt(res)
440 441
442 -def fitline(x, y, lo=0, hi=1, doplot=True, quiet=True, linewidth=2):
443 """fitline takes the position of low and high fit limits and 444 returns the slope. Integer indices in the x data set can be 445 specified for the low and high limits, otherwise use a fraction 446 between 0 and 1. 447 448 In application to fractal dimension estimation, if 449 x = log d (radius, a.k.a. inter-point distance) and 450 y = log v (index, a.k.a. estimate of volume at a given radius) 451 then the slope is the dimension estimate of the data set. 452 453 Also see scipy.stat.linregress for a more sophisticated version 454 """ 455 lendat = len(x) 456 assert hi > lo 457 if lo >= 0 and hi <= 1: 458 loix = int(lendat*lo) 459 hiix = int(lendat*hi) 460 elif lo >=0 and hi > 0: 461 loix = int(lo) 462 hiix = int(hi) 463 else: 464 raise ValueError("Invalid low and high fit limits") 465 if not quiet: 466 print "lo ix %d, hi ix %d, max ix %d"%(loix, hiix, lendat-1) 467 pfit = polyfit(x[loix:hiix],y[loix:hiix],1) 468 # print pfit 469 if doplot: 470 plot([x[loix],x[hiix]], 471 [x[loix]*pfit[0]+pfit[1],x[hiix]*pfit[0]+pfit[1]], 472 linewidth=linewidth) 473 return pfit[0]
474 475 476 477 # ---------------------------------------------------------------------------- 478 479 # Data analysis utilities 480
481 -class data_bins(object):
482 """Class for data binning. Indexed by bin number 0 - (num_bins-1). 483 484 Initialization arguments: 485 coordinate -- name of coordinate that occupies bins 486 bin_ords -- ordered sequence of n bin ordinates (integers or floats) 487 defining n-1 bins 488 valuedict -- optional dictionary containing initial values for 489 bins: keys = ordinate values, 490 values = coordinate values 491 tolerance -- for resolving bin edges with finite precision 492 arithmetic (defaults to 1e-12) 493 """ 494
495 - def __init__(self, coordinate, bin_ords, valuedict=None, tolerance=1e-12):
496 assert isincreasing(bin_ords), \ 497 "Bin ordinates must be given as a sequence in increasing order" 498 # bins are referenced internally by integer index 499 self.num_bins = len(bin_ords)-1 500 assert self.num_bins > 0, "Must be more than 2 ordinates to define bins" 501 self.bin_ords = array(bin_ords) 502 self.bins = dict(zip(range(self.num_bins),[0]*self.num_bins)) 503 self.coordinate = coordinate 504 self.tolerance = tolerance 505 self.intervals = [Interval('bin_%s'%i, float, 506 (bin_ords[i],bin_ords[i+1]), \ 507 abseps=tolerance) for i in range(self.num_bins)] 508 self.midpoints = (self.bin_ords[1:]+self.bin_ords[:-1])/2. 509 if valuedict is not None: 510 for k, v in valuedict.iteritems(): 511 self[self.resolve_bin_index(k)] = v
512
513 - def resolve_bin_index(self, ordinate):
514 """Find bin number (index) associated with ordinate""" 515 try: 516 # return first bin index for which ordinate in bin interval is 517 # 'uncertain' or 'contained' 518 return [interval.contains(ordinate) is not notcontained \ 519 for interval in self.intervals].index(True) 520 except ValueError: 521 # no bin found 522 raise ValueError("No corresponding bin for this ordinate")
523
524 - def __call__(self, ordinate):
525 """Return content of the bin that ordinate resolves to. 526 """ 527 return self.bins[self.resolve_bin_index(ordinate)]
528
529 - def __getitem__(self, bin_index):
530 if isinstance(bin_index, _int_types): 531 # bin_index is a single index, so... 532 # return first bin for which ordinate in bin interval is 533 # 'uncertain' or 'contained' 534 return self.bins[bin_index] 535 elif isinstance(bin_index, slice): 536 if bin_index.step is not None: 537 if bin_index.step != 1: 538 raise ValueError("Cannot step in slice of bins") 539 if bin_index.start is None: 540 start = 0 541 else: 542 start = bin_index.start 543 if bin_index.stop is None: 544 stop = self.num_bins 545 else: 546 stop = bin_index.stop 547 assert start < stop, \ 548 "Slice must go in increasing order" 549 new_bin_ords = self.bin_ords[start:stop+1] 550 b = bin(self.coordinate, new_bin_ords, 551 tolerance=self.tolerance) 552 for x in new_bin_ords[:-1]: 553 bin_ix = self.resolve_bin_index(x+self.tolerance) 554 b.bins[bin_ix] = self.bins[bin_ix] 555 return b 556 else: 557 raise ValueError("Invalid indexing of bins")
558
559 - def __setitem__(self, bin_index, value):
560 """Set value of a given bin number (index).""" 561 self.bins[bin_index] = value
562
563 - def increment(self, ordinate, inc=1):
564 """Increment the bin corresponding to the ordinate argument. 565 Optional argument inc sets increment amount (default 1). 566 """ 567 self.bins[self.resolve_bin_index(ordinate)] += inc
568
569 - def bin(self, values):
570 """Increment bin contents according to values passed.""" 571 if isinstance(values, _seq_types): 572 for v in values: 573 self.increment(v) 574 else: 575 self.increment(values)
576
577 - def clear(self):
578 """Reset all bins to have zero contents.""" 579 self.bins = dict(zip(range(self.num_bins),[0]*self.num_bins))
580
581 - def __str__(self):
582 return "Binned data with intervals %s:\n By bin number: %s"%(str([i.get() \ 583 for i in self.intervals]), str(self.bins))
584 585 __repr__ = __str__ 586
587 - def to_pointset(self):
588 """Convert to a pointset""" 589 bin_ixs = xrange(self.num_bins) 590 return Pointset(indepvararray=range(self.num_bins), 591 coorddict={self.coordinate: [self.bins[ix] \ 592 for ix in bin_ixs], 593 'bin_limit_lo': [self.intervals[ix][0] \ 594 for ix in bin_ixs], 595 'bin_limit_hi': [self.intervals[ix][1] \ 596 for ix in bin_ixs] 597 })
598
599 - def to_arrays(self):
600 """Convert to two arrays of bin indices and values""" 601 x, y = sortedDictLists(self.bins, byvalue=False) 602 return (array(x), array(y))
603
604 - def values(self):
605 return sortedDictValues(self.bins)
606
607 - def keys(self):
608 return sortedDictKeys(self.bins)
609
610 - def mean(self):
611 """Mean of binned data""" 612 a = self.midpoints*array(sortedDictValues(self.bins)) 613 return sum(a) / sum(self.bins.values())
614
615 - def std(self):
616 """Standard deviation of binned data""" 617 vals = array(sortedDictValues(self.bins)) 618 a = self.midpoints*vals 619 sum_bin_vals = sum(self.bins.values()) 620 mean_val = sum(a) / sum_bin_vals 621 return sqrt(sum(((self.midpoints-mean_val)*vals)**2) / sum_bin_vals)
622 623
624 -def find_closest_val(x, a, eps=1e-7, which_norm=2):
625 """Find one value in a closer than eps to x""" 626 for aval in a: 627 if norm(x-aval, which_norm) < eps: 628 return aval 629 return None
630 631
632 -def find_nearby_ball(data, refix, r, which_norm=2, include_ref=False):
633 """Return indices of all points in data that are inside a ball of 634 radius r from the reference point (not included).""" 635 nearby = [] 636 refpt = data[refix] 637 if include_ref: 638 for i in xrange(size(data,0)): 639 if norm(refpt-data[i], which_norm) <= r: 640 nearby.append(i) 641 else: 642 for i in xrange(size(data,0)): 643 if i != refix: 644 if norm(refpt-data[i], which_norm) <= r: 645 nearby.append(i) 646 return nearby
647
648 -def find_nearby_annulus(data, refix, rlo, rhi, which_norm=2):
649 """Return a list containing input data's indices of all neighbours of the 650 reference point within the range of distances d such that dlo < d < dhi 651 """ 652 assert rlo > 0 and rhi > rlo 653 nearby = [] 654 for i in xrange(size(data,0)): 655 if i != refix: 656 d = norm(data[refix]-data[i], which_norm) 657 if d > rlo and d < rhi: 658 nearby.append(i) 659 return nearby
660
661 -def find_central_point(data, which_norm=2):
662 """Find the index of a point closest to the 'centre' of a data set. 663 The centre is defined to be where the mean of the points' position 664 vectors relative to the centre is minimized in each direction.""" 665 dim = data.shape[1] 666 centre = mean(data,0) 667 ds = array([norm(p, which_norm) for p in data-centre]) 668 ds_sort_ixs = ds.argsort() 669 return ds_sort_ixs[0]
670 671
672 -def find_recurrences(data, refix, r, times=None, ignore_small=0, 673 which_norm=2):
674 """Find recurrences of a trajectory in a ball of radius r centred at the 675 reference point given by refix in data. 676 677 If recurrence times are desired, pass an associated time array corresponding 678 to the points in the data. 679 680 If ignore_small > 0, recurrences of fewer than this number of data points 681 will be treated as anomalous and ignored completely. 682 683 Returns a structure (class common.args) having attributes: 684 partitions -- pairs of entry and exit indices for each recurrence in 685 the ball 686 ball_ixs -- all consecutive indices into data for each recurrence 687 partitions_lengths -- number of indices in each partition 688 rec_times -- times to next recurrence in the set from previous 689 """ 690 if times is not None: 691 assert len(times) == len(data), "Times must have same length as data" 692 # ball_ixs is already sorted 693 ball_ixs = find_nearby_ball(data, refix, r, which_norm, include_ref=True) 694 result = args(ball_ixs=[], partitions=[], partition_lengths=[], 695 rec_times=None) 696 if len(ball_ixs) == 0: 697 # No points found in ball 698 return result 699 # find boundaries of each partition between contiguous indices 700 partition_bds = out_of_seq(ball_ixs) 701 if len(partition_bds) == 0: 702 result.ball_ixs = ball_ixs 703 result.partitions = [(ball_ixs[0], ball_ixs[-1])] 704 result.partition_lengths = [ball_ixs[-1] - ball_ixs[0] + 1] 705 if times is not None: 706 # only update this if times present to be consistent with other 707 # changes of rec_times from default None 708 result.rec_times = [Inf] 709 return result 710 else: 711 plen = ball_ixs[partition_bds[0]-1] - ball_ixs[0] + 1 712 if plen > ignore_small: 713 partitions = [(ball_ixs[0], ball_ixs[partition_bds[0]-1])] 714 else: 715 partitions = [] 716 # find contiguous indices (and corresponding times) 717 for i in range(len(partition_bds)-1): 718 plo = ball_ixs[partition_bds[i]] 719 phi = ball_ixs[partition_bds[i+1]-1] 720 plen = phi - plo + 1 721 if plen > ignore_small: 722 partitions.append((plo, phi)) 723 plo = ball_ixs[partition_bds[-1]] 724 phi = ball_ixs[-1] 725 plen = phi - plo + 1 726 if plen > ignore_small: 727 partitions.append((plo, phi)) 728 result.ball_ixs = ball_ixs 729 result.partitions = partitions 730 result.partition_lengths = [p[1]-p[0]+1 for p in partitions] 731 if times is None: 732 return result 733 else: 734 result.rec_times = recurrence_times(times, partitions) 735 return result
736 737
738 -def find_diameter(data, eps, which_norm=2):
739 """Find approximate diameter of data set, up to tolerance defined by eps > 0. 740 741 Data assumed to be a rank-2 array. 742 """ 743 assert eps > 0, "Tolerance must be positive" 744 num_pts = len(data) 745 if num_pts > 400: 746 # If data well distributed, assume that 100 sample points is a good 747 # initial set to test 748 sample_rate = int(ceil(num_pts/100.)) 749 else: 750 # small enough data set to do by brute force on all points 751 sample_rate = 1 752 old_diam = -Inf 753 delta = Inf # change in estimate of radius on each iteration 754 while delta > eps: 755 #print "Sample rate = ", sample_rate 756 downsampled = data[::sample_rate] 757 mind, diam = dist_between_datasets(downsampled, downsampled, 758 which_norm) 759 if sample_rate == 1: 760 # can go no further! 761 delta = 0 762 break 763 else: 764 sample_rate = int(ceil(sample_rate/2.)) 765 delta = diam - old_diam 766 old_diam = diam 767 #print old_diam, delta 768 return diam
769 770
771 -def dist_between_datasets(data1, data2, which_norm=2):
772 maxd = -Inf 773 mind = Inf 774 for v1 in data1: 775 for v2 in data2: 776 d = norm(v1-v2, which_norm) 777 if d > maxd: 778 maxd = d 779 elif d < mind: 780 mind = d 781 return (mind, maxd)
782 783
784 -def out_of_seq(a, inc_only=False):
785 """Determine if and where an integer array a is not increasing or 786 decreasing in sequence by 1. 787 """ 788 v_old = a[0] 789 i = 0 790 out = [] 791 dirn = 0 792 for v in a[1:]: 793 i +=1 794 if v == v_old+1: 795 if dirn == -1: 796 out.append(i) 797 dirn = 1 798 elif v == v_old-1: 799 if dirn == 1: 800 out.append(i) 801 dirn = -1 802 else: 803 dirn = 0 804 out.append(i) 805 v_old = v 806 return out
807 808
809 -def whiten(data):
810 """Subtract mean and divide by standard deviation in each column of data""" 811 wdata=zeros(shape(data),Float) 812 for d in range(shape(data)[1]): 813 x = data[:,d] 814 wdata[:,d] = (x-mean(x))/std(x) 815 return wdata
816 817
818 -def find_knees(data, tol=1., inlog=False, verbose=False):
819 """ 820 Find one or more 'knees' in data, according to high second derivatives. 821 822 inlog option finds knees in the logs of the data entries. 823 tol=1. works well if inlog=False 824 tol=0.3 works well if inlog=True 825 """ 826 knee_ixs = [] 827 for i in range(1,len(data)-1): 828 if inlog: 829 frac = log(data[i+1])+log(data[i-1])-2*log(data[i]) 830 else: 831 d2 = data[i+1]+data[i-1]-2*data[i] 832 try: 833 frac = d2/data[i] 834 except ZeroDivisionError: 835 frac = Inf 836 if verbose: 837 print i, data[i], frac 838 if frac > tol and frac < Inf: 839 knee_ixs.append((i,frac)) 840 if verbose: 841 print "High second derivatives at: ", knee_ixs, "\n" 842 knees = [] 843 found = False # in a contiguous segment of high second derivatives 844 curr_kixs = [] 845 old_kix = None 846 for i in range(len(knee_ixs)): 847 process = False 848 kix, frac = knee_ixs[i] 849 if verbose: 850 print "Processing knee at index", kix 851 if kix-1 == old_kix: 852 if i == len(knee_ixs)-1: 853 # this is the last index so have to process 854 process = True 855 if found: 856 curr_kixs.append(i) 857 else: 858 curr_kixs = [i-1, i] 859 found = True 860 else: 861 process = old_kix is not None 862 if verbose: 863 print old_kix, found, curr_kixs 864 if process: 865 if found: 866 found = False 867 if verbose: 868 print "Current knee indices:", curr_kixs, 869 print [knee_ixs[k] for k in curr_kixs] 870 all_d2 = array([knee_ixs[k][1] for k in curr_kixs]) 871 ixs_sort = argsort(all_d2) 872 max_ix = ixs_sort[-1] 873 knees.append(knee_ixs[curr_kixs[max_ix]][0]) 874 curr_kixs = [] 875 else: 876 if verbose: 877 print "Appending knee index", old_kix 878 knees.append(old_kix) 879 old_kix = kix 880 # add final singleton index of high derivative, if any 881 if not found and old_kix not in knees: 882 if verbose: 883 print "Appending knee index", old_kix 884 knees.append(old_kix) 885 return knees
886 887 888 # ----------------------------------------------------------------------------- 889 890 # Internal helper functions 891
892 -def recurrence_times(ts, partitions):
893 """Internal function for use by find_recurrences""" 894 rtimes = [] 895 if len(partitions) == 1: 896 return [Inf] 897 for i in range(len(partitions)-1): 898 (plo, phi) = partitions[i] 899 rtimes.append(ts[partitions[i+1][0]]-ts[partitions[i][1]]) 900 return rtimes
901 902
903 -def find_from_sorted(x, v, next_largest=1, indices=None):
904 if isinstance(v, _seq_types): 905 # sequence v 906 return [find(array(x), y, next_largest, indices) for y in v] 907 else: 908 # singleton v 909 return find(array(x), v, next_largest, indices)
910
911 -def colormap(mag, cmin, cmax):
912 """ 913 Return a tuple of floats between 0 and 1 for the red, green and 914 blue amplitudes. 915 Function originally named floatRgb and written by Alexander Pletzer, 916 from the Python Cookbook. 917 """ 918 919 try: 920 # normalize to [0,1] 921 x = float(mag-cmin)/float(cmax-cmin) 922 except: 923 # cmax = cmin 924 x = 0.5 925 blue = min((max((4*(0.75-x), 0.)), 1.)) 926 red = min((max((4*(x-0.25), 0.)), 1.)) 927 green= min((max((4*math.fabs(x-0.5)-1., 0.)), 1.)) 928 return (red, green, blue)
929 930
931 -def dump_progress(x, tot, filename='progress_info.dat'):
932 """Send progress report to a file""" 933 f=open(filename, 'w') 934 f.write("Progress: %d / %d \n"%(x,tot)) 935 f.write("Date and time of update: " + " ".join([str(x) for x in localtime()[:-3]]) + "\n\n") 936 f.close()
937 938
939 -def second_diff(data, i):
940 """Naive second difference formula at position i of data""" 941 return data[i+1]+data[i-1]-2*data[i]
942 943
944 -def _adjust_ix(i, n):
945 """Internal helper function""" 946 if i >= n: 947 return i+1 948 else: 949 return i
950