Package csb :: Package statistics
[frames] | no frames]

Source Code for Package csb.statistics

  1  """ 
  2  Statistics root package. 
  3   
  4  This package contains a number of common statistical utilities. Sub-packages 
  5  provide more specialized APIs, for example L{csb.statistics.pdf} defines the 
  6  probability density object model. 
  7  """ 
  8   
9 -class Cumulative(object):
10 11 total_mem = 1e8 12
13 - def __init__(self, data):
14 15 self.data = data
16
17 - def __call__(self, x, nchunks=None):
18 19 from numpy import greater, reshape, concatenate 20 21 c = [] 22 x = reshape(x, (-1,)) 23 24 if nchunks is None: 25 26 total_size = len(x) * len(self.data) 27 nchunks = total_size / self.total_mem + int(total_size % self.total_mem != 0) 28 29 size = len(x) / nchunks + int(len(x) % nchunks != 0) 30 31 while len(x): 32 33 y = x[:size] 34 x = x[size:] 35 36 c = concatenate((c, greater.outer(y, self.data).sum(1) / float(len(self.data)))) 37 38 return c
39
40 - def cumulative_density(self, x, nchunks=None):
41 return 1 - self.__call__(x, nchunks)
42
43 -def geometric_mean(x, axis=None):
44 """ 45 @param x: 46 @param axis: compute the geometric mean along this axis 47 48 @return: geometric mean of x 49 """ 50 from numpy import exp, log, clip, mean 51 52 return exp(mean(log(clip(x, 1e-300, 1e300)), axis))
53
54 -def harmonic_mean(x, axis=None):
55 """ 56 @param x: 57 @param axis: compute the harmonic mean along this axis 58 59 @return: harmonic mean of x 60 """ 61 from numpy import mean 62 63 return 1 / mean(1 / x, axis)
64
65 -def kurtosis(x, axis=None):
66 """ 67 @param x: random variables 68 @param axis: compute the kurtosis along this axis 69 70 @return: Sample kurtosis of x 71 """ 72 from numpy import mean, std 73 74 m = x.mean(axis) 75 a = mean((x - m) ** 4, axis) 76 s = std(x, axis) 77 78 return a / s ** 4 - 3
79
80 -def skewness(x, axis=None):
81 """ 82 @param x: random variables 83 @param axis: compute the skewness along this axis 84 85 @return: Sample skewness of x 86 """ 87 from numpy import mean, std 88 89 s = std(x) 90 return mean((x - x.mean()) ** 3, axis) / s ** 3
91
92 -def autocorrelation(x, n):
93 """ 94 auto-correlation of a times series 95 96 @param x: time series 97 @type x: numpy.array 98 @param n: Maximal lag for which to compute the auto-correlation 99 @type n: int 100 """ 101 from numpy import array, mean, std 102 x = x - x.mean() 103 return array([mean(x[i:] * x[:len(x) - i]) for i in range(n)]) / std(x)**2
104
105 -def probabilistic_and(p, axis=0):
106 """ 107 Probabilistic version of AND 108 """ 109 from numpy import array, multiply 110 return multiply.reduce(array(p), axis=axis)
111
112 -def probabilistic_or(p, axis=0):
113 """ 114 Probabilistic version of OR 115 """ 116 from numpy import array 117 return 1 - probabilistic_and(1 - array(p), axis)
118
119 -def probabilistic_xor(p, axis=0):
120 """ 121 Probabilistic version of XOR. 122 Works only for axis=0. 123 """ 124 from numpy import array 125 126 p = array(p) 127 p_not = 1 - p 128 129 P = [] 130 131 for i in range(p.shape[axis]): 132 x = p_not * 1 133 x[i] = p[i] 134 P.append(probabilistic_and(x, 0)) 135 136 return probabilistic_or(P, 0)
137
138 -def principal_coordinates(D, nd=None):
139 """ 140 Reconstruction of a multidimensional configuration that 141 optimally reproduces the input distance matrix. 142 143 See: Gower, J (1966) 144 """ 145 from numpy import clip, sqrt, take, argsort, sort 146 from csb.numeric import reverse 147 from scipy.linalg import eigh 148 149 ## calculate centered similarity matrix 150 151 B = -clip(D, 1e-150, 1e150) ** 2 / 2. 152 153 b = B.mean(0) 154 155 B = B - b 156 B = (B.T - b).T 157 B += b.mean() 158 159 ## calculate spectral decomposition 160 161 v, U = eigh(B) 162 v = v.real 163 U = U.real 164 165 U = take(U, argsort(v), 1) 166 v = sort(v) 167 168 U = reverse(U, 1) 169 v = reverse(v) 170 171 if nd is None: nd = len(v) 172 173 X = U[:, :nd] * sqrt(clip(v[:nd], 0., 1e300)) 174 175 return X
176
177 -def entropy(p):
178 """ 179 Calculate the entropy of p. 180 @return: entropy of p 181 """ 182 from csb.numeric import log 183 from numpy import sum 184 185 return -sum(p * log(p))
186
187 -def histogram2D(x, nbins=100, axes=None, nbatch=1000, normalize=True):
188 """ 189 Non-greedy two-dimensional histogram. 190 191 @param x: input array of rank two 192 @type x: numpy array 193 @param nbins: number of bins 194 @type nbins: integer 195 @param axes: x- and y-axes used for binning the data (if provided this will be used instead of <nbins>) 196 @type axes: tuple of two one-dimensional numpy arrays 197 @param nbatch: size of batch that is used to sort the data into the 2D grid 198 @type nbatch: integer 199 @param normalize: specifies whether histogram should be normalized 200 @type normalize: boolean 201 202 @return: 2-rank array storing histogram, tuple of x- and y-axis 203 """ 204 from numpy import linspace, zeros, argmin, fabs, subtract, transpose 205 206 if axes is None: 207 208 lower, upper = x.min(0), x.max(0) 209 axes = [linspace(lower[i], upper[i], nbins) for i in range(lower.shape[0])] 210 211 H = zeros((len(axes[0]), len(axes[1]))) 212 213 while len(x): 214 215 y = x[:nbatch] 216 x = x[nbatch:] 217 218 I = transpose([argmin(fabs(subtract.outer(y[:, i], axes[i])), 1) for i in range(2)]) 219 220 for i, j in I: H[i, j] += 1 221 222 if normalize: 223 H = H / H.sum() / (axes[0][1] - axes[0][0]) / (axes[1][1] - axes[1][0]) 224 225 return H, axes
226
227 -def histogram_nd(x, nbins=100, axes=None, nbatch=1000, normalize=True):
228 """ 229 Non-greedy n-dimemsional histogram. 230 231 @param x: input array of rank (-1,n) 232 @type x: numpy array 233 @param nbins: number of bins 234 @type nbins: integer 235 @param axes: axes used for binning the data (if provided this will be used instead of <nbins>) 236 @type axes: tuple of two one-dimensional numpy arrays 237 @param nbatch: size of batch that is used to sort the data into the nD grid 238 @type nbatch: integer 239 @param normalize: specifies whether histogram should be normalized 240 @type normalize: boolean 241 242 @return: n-rank array storing histogram, tuple of axes 243 """ 244 import numpy as np 245 246 if len(x.shape) == 1: 247 x = np.reshape(x, (-1,1)) 248 249 d = x.shape[1] 250 251 if axes is None: 252 253 lower, upper = x.min(0), x.max(0) 254 axes = [np.linspace(lower[i], upper[i], nbins) for i in range(d)] 255 256 shape = tuple(map(len, axes)) 257 258 H = np.zeros(shape) 259 s = np.multiply.accumulate(np.array((1,) + H.shape[:-1]))[::-1] 260 H = H.flatten() 261 262 while len(x): 263 264 y = x[:nbatch] 265 x = x[nbatch:] 266 267 I = np.transpose([np.argmin(np.fabs(np.subtract.outer(y[:, i], axes[i])), 1) 268 for i in range(d)]) 269 I = np.dot(I, s) 270 I = np.sort(I) 271 272 i = list(set(I.tolist())) 273 n = np.equal.outer(I, i).sum(0) 274 275 H[i] += n 276 277 if normalize: 278 H = H / H.sum() / np.multiply.reduce([axes[i][1] - axes[i][0] for i in range(d)]) 279 280 H = np.reshape(H, shape) 281 282 return H, axes
283
284 -def density(x, nbins, normalize=True):
285 """ 286 Histogram of univariate input data: basically calls numpy's histogram method and 287 does a proper normalization. 288 289 @param x: input numpy array 290 @param nbins: number of bins 291 @type nbins: integer 292 @param normalize: if true, histogram will be normalized 293 """ 294 from numpy import histogram 295 296 hy, hx = histogram(x, nbins) 297 hx = 0.5 * (hx[1:] + hx[:-1]) 298 hy = hy.astype('d') 299 if normalize: 300 hy /= (hx[1] - hx[0]) * hy.sum() 301 302 return hx, hy
303
304 -def circvar(a, axis=None):
305 """ 306 Calculate circular variance of a circular variable. 307 308 @param a: input array 309 @param axis: axis along which mean is calculated 310 @type axis: None or integer 311 """ 312 from numpy import average, cos, sin 313 return 1 - average(cos(a), axis) ** 2 - average(sin(a), axis) ** 2
314
315 -def circmean(a, axis=None):
316 """ 317 Estimate mean of a circular variable 318 319 @param a: input array 320 @param axis: axis along which mean is calculated 321 @type axis: None or integer 322 """ 323 from numpy import sin, cos, arctan2, average 324 return arctan2(average(sin(a), axis), average(cos(a), axis))
325
326 -def running_average(x, w, axis=None):
327 """ 328 Calculates a running average for given window size 329 330 @param x: input array 331 @param w: window size 332 @type w: integer 333 @param axis: axis along which mean is calculated 334 """ 335 from numpy import array, mean 336 return array([mean(x[i:i + w], axis) for i in range(len(x) - w)])
337
338 -def weighted_median(x, w):
339 """ 340 Calculates the weighted median, that is the minimizer of 341 argmin {\sum w_i |x_i - \mu|} 342 343 @param x: input array 344 @param w: array of weights 345 """ 346 from numpy import sum, add, argsort, sort 347 348 w = w / w.sum() 349 w = w[argsort(x)] 350 x = sort(x) 351 j = sum(add.accumulate(w) < 0.5) 352 353 return x[j]
354