Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/scipy/stats/_binned_statistic.py : 8%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import builtins
2import numpy as np
3from numpy.testing import suppress_warnings
4from operator import index
5from collections import namedtuple
7__all__ = ['binned_statistic',
8 'binned_statistic_2d',
9 'binned_statistic_dd']
12BinnedStatisticResult = namedtuple('BinnedStatisticResult',
13 ('statistic', 'bin_edges', 'binnumber'))
16def binned_statistic(x, values, statistic='mean',
17 bins=10, range=None):
18 """
19 Compute a binned statistic for one or more sets of data.
21 This is a generalization of a histogram function. A histogram divides
22 the space into bins, and returns the count of the number of points in
23 each bin. This function allows the computation of the sum, mean, median,
24 or other statistic of the values (or set of values) within each bin.
26 Parameters
27 ----------
28 x : (N,) array_like
29 A sequence of values to be binned.
30 values : (N,) array_like or list of (N,) array_like
31 The data on which the statistic will be computed. This must be
32 the same shape as `x`, or a set of sequences - each the same shape as
33 `x`. If `values` is a set of sequences, the statistic will be computed
34 on each independently.
35 statistic : string or callable, optional
36 The statistic to compute (default is 'mean').
37 The following statistics are available:
39 * 'mean' : compute the mean of values for points within each bin.
40 Empty bins will be represented by NaN.
41 * 'std' : compute the standard deviation within each bin. This
42 is implicitly calculated with ddof=0.
43 * 'median' : compute the median of values for points within each
44 bin. Empty bins will be represented by NaN.
45 * 'count' : compute the count of points within each bin. This is
46 identical to an unweighted histogram. `values` array is not
47 referenced.
48 * 'sum' : compute the sum of values for points within each bin.
49 This is identical to a weighted histogram.
50 * 'min' : compute the minimum of values for points within each bin.
51 Empty bins will be represented by NaN.
52 * 'max' : compute the maximum of values for point within each bin.
53 Empty bins will be represented by NaN.
54 * function : a user-defined function which takes a 1D array of
55 values, and outputs a single numerical statistic. This function
56 will be called on the values in each bin. Empty bins will be
57 represented by function([]), or NaN if this returns an error.
59 bins : int or sequence of scalars, optional
60 If `bins` is an int, it defines the number of equal-width bins in the
61 given range (10 by default). If `bins` is a sequence, it defines the
62 bin edges, including the rightmost edge, allowing for non-uniform bin
63 widths. Values in `x` that are smaller than lowest bin edge are
64 assigned to bin number 0, values beyond the highest bin are assigned to
65 ``bins[-1]``. If the bin edges are specified, the number of bins will
66 be, (nx = len(bins)-1).
67 range : (float, float) or [(float, float)], optional
68 The lower and upper range of the bins. If not provided, range
69 is simply ``(x.min(), x.max())``. Values outside the range are
70 ignored.
72 Returns
73 -------
74 statistic : array
75 The values of the selected statistic in each bin.
76 bin_edges : array of dtype float
77 Return the bin edges ``(length(statistic)+1)``.
78 binnumber: 1-D ndarray of ints
79 Indices of the bins (corresponding to `bin_edges`) in which each value
80 of `x` belongs. Same length as `values`. A binnumber of `i` means the
81 corresponding value is between (bin_edges[i-1], bin_edges[i]).
83 See Also
84 --------
85 numpy.digitize, numpy.histogram, binned_statistic_2d, binned_statistic_dd
87 Notes
88 -----
89 All but the last (righthand-most) bin is half-open. In other words, if
90 `bins` is ``[1, 2, 3, 4]``, then the first bin is ``[1, 2)`` (including 1,
91 but excluding 2) and the second ``[2, 3)``. The last bin, however, is
92 ``[3, 4]``, which *includes* 4.
94 .. versionadded:: 0.11.0
96 Examples
97 --------
98 >>> from scipy import stats
99 >>> import matplotlib.pyplot as plt
101 First some basic examples:
103 Create two evenly spaced bins in the range of the given sample, and sum the
104 corresponding values in each of those bins:
106 >>> values = [1.0, 1.0, 2.0, 1.5, 3.0]
107 >>> stats.binned_statistic([1, 1, 2, 5, 7], values, 'sum', bins=2)
108 BinnedStatisticResult(statistic=array([4. , 4.5]),
109 bin_edges=array([1., 4., 7.]), binnumber=array([1, 1, 1, 2, 2]))
111 Multiple arrays of values can also be passed. The statistic is calculated
112 on each set independently:
114 >>> values = [[1.0, 1.0, 2.0, 1.5, 3.0], [2.0, 2.0, 4.0, 3.0, 6.0]]
115 >>> stats.binned_statistic([1, 1, 2, 5, 7], values, 'sum', bins=2)
116 BinnedStatisticResult(statistic=array([[4. , 4.5],
117 [8. , 9. ]]), bin_edges=array([1., 4., 7.]),
118 binnumber=array([1, 1, 1, 2, 2]))
120 >>> stats.binned_statistic([1, 2, 1, 2, 4], np.arange(5), statistic='mean',
121 ... bins=3)
122 BinnedStatisticResult(statistic=array([1., 2., 4.]),
123 bin_edges=array([1., 2., 3., 4.]),
124 binnumber=array([1, 2, 1, 2, 3]))
126 As a second example, we now generate some random data of sailing boat speed
127 as a function of wind speed, and then determine how fast our boat is for
128 certain wind speeds:
130 >>> windspeed = 8 * np.random.rand(500)
131 >>> boatspeed = .3 * windspeed**.5 + .2 * np.random.rand(500)
132 >>> bin_means, bin_edges, binnumber = stats.binned_statistic(windspeed,
133 ... boatspeed, statistic='median', bins=[1,2,3,4,5,6,7])
134 >>> plt.figure()
135 >>> plt.plot(windspeed, boatspeed, 'b.', label='raw data')
136 >>> plt.hlines(bin_means, bin_edges[:-1], bin_edges[1:], colors='g', lw=5,
137 ... label='binned statistic of data')
138 >>> plt.legend()
140 Now we can use ``binnumber`` to select all datapoints with a windspeed
141 below 1:
143 >>> low_boatspeed = boatspeed[binnumber == 0]
145 As a final example, we will use ``bin_edges`` and ``binnumber`` to make a
146 plot of a distribution that shows the mean and distribution around that
147 mean per bin, on top of a regular histogram and the probability
148 distribution function:
150 >>> x = np.linspace(0, 5, num=500)
151 >>> x_pdf = stats.maxwell.pdf(x)
152 >>> samples = stats.maxwell.rvs(size=10000)
154 >>> bin_means, bin_edges, binnumber = stats.binned_statistic(x, x_pdf,
155 ... statistic='mean', bins=25)
156 >>> bin_width = (bin_edges[1] - bin_edges[0])
157 >>> bin_centers = bin_edges[1:] - bin_width/2
159 >>> plt.figure()
160 >>> plt.hist(samples, bins=50, density=True, histtype='stepfilled',
161 ... alpha=0.2, label='histogram of data')
162 >>> plt.plot(x, x_pdf, 'r-', label='analytical pdf')
163 >>> plt.hlines(bin_means, bin_edges[:-1], bin_edges[1:], colors='g', lw=2,
164 ... label='binned statistic of data')
165 >>> plt.plot((binnumber - 0.5) * bin_width, x_pdf, 'g.', alpha=0.5)
166 >>> plt.legend(fontsize=10)
167 >>> plt.show()
169 """
170 try:
171 N = len(bins)
172 except TypeError:
173 N = 1
175 if N != 1:
176 bins = [np.asarray(bins, float)]
178 if range is not None:
179 if len(range) == 2:
180 range = [range]
182 medians, edges, binnumbers = binned_statistic_dd(
183 [x], values, statistic, bins, range)
185 return BinnedStatisticResult(medians, edges[0], binnumbers)
188BinnedStatistic2dResult = namedtuple('BinnedStatistic2dResult',
189 ('statistic', 'x_edge', 'y_edge',
190 'binnumber'))
193def binned_statistic_2d(x, y, values, statistic='mean',
194 bins=10, range=None, expand_binnumbers=False):
195 """
196 Compute a bidimensional binned statistic for one or more sets of data.
198 This is a generalization of a histogram2d function. A histogram divides
199 the space into bins, and returns the count of the number of points in
200 each bin. This function allows the computation of the sum, mean, median,
201 or other statistic of the values (or set of values) within each bin.
203 Parameters
204 ----------
205 x : (N,) array_like
206 A sequence of values to be binned along the first dimension.
207 y : (N,) array_like
208 A sequence of values to be binned along the second dimension.
209 values : (N,) array_like or list of (N,) array_like
210 The data on which the statistic will be computed. This must be
211 the same shape as `x`, or a list of sequences - each with the same
212 shape as `x`. If `values` is such a list, the statistic will be
213 computed on each independently.
214 statistic : string or callable, optional
215 The statistic to compute (default is 'mean').
216 The following statistics are available:
218 * 'mean' : compute the mean of values for points within each bin.
219 Empty bins will be represented by NaN.
220 * 'std' : compute the standard deviation within each bin. This
221 is implicitly calculated with ddof=0.
222 * 'median' : compute the median of values for points within each
223 bin. Empty bins will be represented by NaN.
224 * 'count' : compute the count of points within each bin. This is
225 identical to an unweighted histogram. `values` array is not
226 referenced.
227 * 'sum' : compute the sum of values for points within each bin.
228 This is identical to a weighted histogram.
229 * 'min' : compute the minimum of values for points within each bin.
230 Empty bins will be represented by NaN.
231 * 'max' : compute the maximum of values for point within each bin.
232 Empty bins will be represented by NaN.
233 * function : a user-defined function which takes a 1D array of
234 values, and outputs a single numerical statistic. This function
235 will be called on the values in each bin. Empty bins will be
236 represented by function([]), or NaN if this returns an error.
238 bins : int or [int, int] or array_like or [array, array], optional
239 The bin specification:
241 * the number of bins for the two dimensions (nx = ny = bins),
242 * the number of bins in each dimension (nx, ny = bins),
243 * the bin edges for the two dimensions (x_edge = y_edge = bins),
244 * the bin edges in each dimension (x_edge, y_edge = bins).
246 If the bin edges are specified, the number of bins will be,
247 (nx = len(x_edge)-1, ny = len(y_edge)-1).
249 range : (2,2) array_like, optional
250 The leftmost and rightmost edges of the bins along each dimension
251 (if not specified explicitly in the `bins` parameters):
252 [[xmin, xmax], [ymin, ymax]]. All values outside of this range will be
253 considered outliers and not tallied in the histogram.
254 expand_binnumbers : bool, optional
255 'False' (default): the returned `binnumber` is a shape (N,) array of
256 linearized bin indices.
257 'True': the returned `binnumber` is 'unraveled' into a shape (2,N)
258 ndarray, where each row gives the bin numbers in the corresponding
259 dimension.
260 See the `binnumber` returned value, and the `Examples` section.
262 .. versionadded:: 0.17.0
264 Returns
265 -------
266 statistic : (nx, ny) ndarray
267 The values of the selected statistic in each two-dimensional bin.
268 x_edge : (nx + 1) ndarray
269 The bin edges along the first dimension.
270 y_edge : (ny + 1) ndarray
271 The bin edges along the second dimension.
272 binnumber : (N,) array of ints or (2,N) ndarray of ints
273 This assigns to each element of `sample` an integer that represents the
274 bin in which this observation falls. The representation depends on the
275 `expand_binnumbers` argument. See `Notes` for details.
278 See Also
279 --------
280 numpy.digitize, numpy.histogram2d, binned_statistic, binned_statistic_dd
282 Notes
283 -----
284 Binedges:
285 All but the last (righthand-most) bin is half-open. In other words, if
286 `bins` is ``[1, 2, 3, 4]``, then the first bin is ``[1, 2)`` (including 1,
287 but excluding 2) and the second ``[2, 3)``. The last bin, however, is
288 ``[3, 4]``, which *includes* 4.
290 `binnumber`:
291 This returned argument assigns to each element of `sample` an integer that
292 represents the bin in which it belongs. The representation depends on the
293 `expand_binnumbers` argument. If 'False' (default): The returned
294 `binnumber` is a shape (N,) array of linearized indices mapping each
295 element of `sample` to its corresponding bin (using row-major ordering).
296 If 'True': The returned `binnumber` is a shape (2,N) ndarray where
297 each row indicates bin placements for each dimension respectively. In each
298 dimension, a binnumber of `i` means the corresponding value is between
299 (D_edge[i-1], D_edge[i]), where 'D' is either 'x' or 'y'.
301 .. versionadded:: 0.11.0
303 Examples
304 --------
305 >>> from scipy import stats
307 Calculate the counts with explicit bin-edges:
309 >>> x = [0.1, 0.1, 0.1, 0.6]
310 >>> y = [2.1, 2.6, 2.1, 2.1]
311 >>> binx = [0.0, 0.5, 1.0]
312 >>> biny = [2.0, 2.5, 3.0]
313 >>> ret = stats.binned_statistic_2d(x, y, None, 'count', bins=[binx, biny])
314 >>> ret.statistic
315 array([[2., 1.],
316 [1., 0.]])
318 The bin in which each sample is placed is given by the `binnumber`
319 returned parameter. By default, these are the linearized bin indices:
321 >>> ret.binnumber
322 array([5, 6, 5, 9])
324 The bin indices can also be expanded into separate entries for each
325 dimension using the `expand_binnumbers` parameter:
327 >>> ret = stats.binned_statistic_2d(x, y, None, 'count', bins=[binx, biny],
328 ... expand_binnumbers=True)
329 >>> ret.binnumber
330 array([[1, 1, 1, 2],
331 [1, 2, 1, 1]])
333 Which shows that the first three elements belong in the xbin 1, and the
334 fourth into xbin 2; and so on for y.
336 """
338 # This code is based on np.histogram2d
339 try:
340 N = len(bins)
341 except TypeError:
342 N = 1
344 if N != 1 and N != 2:
345 xedges = yedges = np.asarray(bins, float)
346 bins = [xedges, yedges]
348 medians, edges, binnumbers = binned_statistic_dd(
349 [x, y], values, statistic, bins, range,
350 expand_binnumbers=expand_binnumbers)
352 return BinnedStatistic2dResult(medians, edges[0], edges[1], binnumbers)
355BinnedStatisticddResult = namedtuple('BinnedStatisticddResult',
356 ('statistic', 'bin_edges',
357 'binnumber'))
360def binned_statistic_dd(sample, values, statistic='mean',
361 bins=10, range=None, expand_binnumbers=False,
362 binned_statistic_result=None):
363 """
364 Compute a multidimensional binned statistic for a set of data.
366 This is a generalization of a histogramdd function. A histogram divides
367 the space into bins, and returns the count of the number of points in
368 each bin. This function allows the computation of the sum, mean, median,
369 or other statistic of the values within each bin.
371 Parameters
372 ----------
373 sample : array_like
374 Data to histogram passed as a sequence of N arrays of length D, or
375 as an (N,D) array.
376 values : (N,) array_like or list of (N,) array_like
377 The data on which the statistic will be computed. This must be
378 the same shape as `sample`, or a list of sequences - each with the
379 same shape as `sample`. If `values` is such a list, the statistic
380 will be computed on each independently.
381 statistic : string or callable, optional
382 The statistic to compute (default is 'mean').
383 The following statistics are available:
385 * 'mean' : compute the mean of values for points within each bin.
386 Empty bins will be represented by NaN.
387 * 'median' : compute the median of values for points within each
388 bin. Empty bins will be represented by NaN.
389 * 'count' : compute the count of points within each bin. This is
390 identical to an unweighted histogram. `values` array is not
391 referenced.
392 * 'sum' : compute the sum of values for points within each bin.
393 This is identical to a weighted histogram.
394 * 'std' : compute the standard deviation within each bin. This
395 is implicitly calculated with ddof=0. If the number of values
396 within a given bin is 0 or 1, the computed standard deviation value
397 will be 0 for the bin.
398 * 'min' : compute the minimum of values for points within each bin.
399 Empty bins will be represented by NaN.
400 * 'max' : compute the maximum of values for point within each bin.
401 Empty bins will be represented by NaN.
402 * function : a user-defined function which takes a 1D array of
403 values, and outputs a single numerical statistic. This function
404 will be called on the values in each bin. Empty bins will be
405 represented by function([]), or NaN if this returns an error.
407 bins : sequence or positive int, optional
408 The bin specification must be in one of the following forms:
410 * A sequence of arrays describing the bin edges along each dimension.
411 * The number of bins for each dimension (nx, ny, ... = bins).
412 * The number of bins for all dimensions (nx = ny = ... = bins).
413 range : sequence, optional
414 A sequence of lower and upper bin edges to be used if the edges are
415 not given explicitly in `bins`. Defaults to the minimum and maximum
416 values along each dimension.
417 expand_binnumbers : bool, optional
418 'False' (default): the returned `binnumber` is a shape (N,) array of
419 linearized bin indices.
420 'True': the returned `binnumber` is 'unraveled' into a shape (D,N)
421 ndarray, where each row gives the bin numbers in the corresponding
422 dimension.
423 See the `binnumber` returned value, and the `Examples` section of
424 `binned_statistic_2d`.
425 binned_statistic_result : binnedStatisticddResult
426 Result of a previous call to the function in order to reuse bin edges
427 and bin numbers with new values and/or a different statistic.
428 To reuse bin numbers, `expand_binnumbers` must have been set to False
429 (the default)
431 .. versionadded:: 0.17.0
433 Returns
434 -------
435 statistic : ndarray, shape(nx1, nx2, nx3,...)
436 The values of the selected statistic in each two-dimensional bin.
437 bin_edges : list of ndarrays
438 A list of D arrays describing the (nxi + 1) bin edges for each
439 dimension.
440 binnumber : (N,) array of ints or (D,N) ndarray of ints
441 This assigns to each element of `sample` an integer that represents the
442 bin in which this observation falls. The representation depends on the
443 `expand_binnumbers` argument. See `Notes` for details.
446 See Also
447 --------
448 numpy.digitize, numpy.histogramdd, binned_statistic, binned_statistic_2d
450 Notes
451 -----
452 Binedges:
453 All but the last (righthand-most) bin is half-open in each dimension. In
454 other words, if `bins` is ``[1, 2, 3, 4]``, then the first bin is
455 ``[1, 2)`` (including 1, but excluding 2) and the second ``[2, 3)``. The
456 last bin, however, is ``[3, 4]``, which *includes* 4.
458 `binnumber`:
459 This returned argument assigns to each element of `sample` an integer that
460 represents the bin in which it belongs. The representation depends on the
461 `expand_binnumbers` argument. If 'False' (default): The returned
462 `binnumber` is a shape (N,) array of linearized indices mapping each
463 element of `sample` to its corresponding bin (using row-major ordering).
464 If 'True': The returned `binnumber` is a shape (D,N) ndarray where
465 each row indicates bin placements for each dimension respectively. In each
466 dimension, a binnumber of `i` means the corresponding value is between
467 (bin_edges[D][i-1], bin_edges[D][i]), for each dimension 'D'.
469 .. versionadded:: 0.11.0
471 Examples
472 --------
473 >>> from scipy import stats
474 >>> import matplotlib.pyplot as plt
475 >>> from mpl_toolkits.mplot3d import Axes3D
477 Take an array of 600 (x, y) coordinates as an example.
478 `binned_statistic_dd` can handle arrays of higher dimension `D`. But a plot
479 of dimension `D+1` is required.
481 >>> mu = np.array([0., 1.])
482 >>> sigma = np.array([[1., -0.5],[-0.5, 1.5]])
483 >>> multinormal = stats.multivariate_normal(mu, sigma)
484 >>> data = multinormal.rvs(size=600, random_state=235412)
485 >>> data.shape
486 (600, 2)
488 Create bins and count how many arrays fall in each bin:
490 >>> N = 60
491 >>> x = np.linspace(-3, 3, N)
492 >>> y = np.linspace(-3, 4, N)
493 >>> ret = stats.binned_statistic_dd(data, np.arange(600), bins=[x, y],
494 ... statistic='count')
495 >>> bincounts = ret.statistic
497 Set the volume and the location of bars:
499 >>> dx = x[1] - x[0]
500 >>> dy = y[1] - y[0]
501 >>> x, y = np.meshgrid(x[:-1]+dx/2, y[:-1]+dy/2)
502 >>> z = 0
504 >>> bincounts = bincounts.ravel()
505 >>> x = x.ravel()
506 >>> y = y.ravel()
508 >>> fig = plt.figure()
509 >>> ax = fig.add_subplot(111, projection='3d')
510 >>> with np.errstate(divide='ignore'): # silence random axes3d warning
511 ... ax.bar3d(x, y, z, dx, dy, bincounts)
513 Reuse bin numbers and bin edges with new values:
515 >>> ret2 = stats.binned_statistic_dd(data, -np.arange(600),
516 ... binned_statistic_result=ret,
517 ... statistic='mean')
518 """
519 known_stats = ['mean', 'median', 'count', 'sum', 'std', 'min', 'max']
520 if not callable(statistic) and statistic not in known_stats:
521 raise ValueError('invalid statistic %r' % (statistic,))
523 try:
524 bins = index(bins)
525 except TypeError:
526 # bins is not an integer
527 pass
528 # If bins was an integer-like object, now it is an actual Python int.
530 # NOTE: for _bin_edges(), see e.g. gh-11365
531 if isinstance(bins, int) and not np.isfinite(sample).all():
532 raise ValueError('%r contains non-finite values.' % (sample,))
534 # `Ndim` is the number of dimensions (e.g. `2` for `binned_statistic_2d`)
535 # `Dlen` is the length of elements along each dimension.
536 # This code is based on np.histogramdd
537 try:
538 # `sample` is an ND-array.
539 Dlen, Ndim = sample.shape
540 except (AttributeError, ValueError):
541 # `sample` is a sequence of 1D arrays.
542 sample = np.atleast_2d(sample).T
543 Dlen, Ndim = sample.shape
545 # Store initial shape of `values` to preserve it in the output
546 values = np.asarray(values)
547 input_shape = list(values.shape)
548 # Make sure that `values` is 2D to iterate over rows
549 values = np.atleast_2d(values)
550 Vdim, Vlen = values.shape
552 # Make sure `values` match `sample`
553 if(statistic != 'count' and Vlen != Dlen):
554 raise AttributeError('The number of `values` elements must match the '
555 'length of each `sample` dimension.')
557 try:
558 M = len(bins)
559 if M != Ndim:
560 raise AttributeError('The dimension of bins must be equal '
561 'to the dimension of the sample x.')
562 except TypeError:
563 bins = Ndim * [bins]
565 if binned_statistic_result is None:
566 nbin, edges, dedges = _bin_edges(sample, bins, range)
567 binnumbers = _bin_numbers(sample, nbin, edges, dedges)
568 else:
569 edges = binned_statistic_result.bin_edges
570 nbin = np.array([len(edges[i]) + 1 for i in builtins.range(Ndim)])
571 # +1 for outlier bins
572 dedges = [np.diff(edges[i]) for i in builtins.range(Ndim)]
573 binnumbers = binned_statistic_result.binnumber
575 result = np.empty([Vdim, nbin.prod()], float)
577 if statistic == 'mean':
578 result.fill(np.nan)
579 flatcount = np.bincount(binnumbers, None)
580 a = flatcount.nonzero()
581 for vv in builtins.range(Vdim):
582 flatsum = np.bincount(binnumbers, values[vv])
583 result[vv, a] = flatsum[a] / flatcount[a]
584 elif statistic == 'std':
585 result.fill(0)
586 flatcount = np.bincount(binnumbers, None)
587 a = flatcount.nonzero()
588 for vv in builtins.range(Vdim):
589 for i in np.unique(binnumbers):
590 # NOTE: take std dev by bin, np.std() is 2-pass and stable
591 binned_data = values[vv, binnumbers == i]
592 # calc std only when binned data is 2 or more for speed up.
593 if len(binned_data) >= 2:
594 result[vv, i] = np.std(binned_data)
595 elif statistic == 'count':
596 result.fill(0)
597 flatcount = np.bincount(binnumbers, None)
598 a = np.arange(len(flatcount))
599 result[:, a] = flatcount[np.newaxis, :]
600 elif statistic == 'sum':
601 result.fill(0)
602 for vv in builtins.range(Vdim):
603 flatsum = np.bincount(binnumbers, values[vv])
604 a = np.arange(len(flatsum))
605 result[vv, a] = flatsum
606 elif statistic == 'median':
607 result.fill(np.nan)
608 for i in np.unique(binnumbers):
609 for vv in builtins.range(Vdim):
610 result[vv, i] = np.median(values[vv, binnumbers == i])
611 elif statistic == 'min':
612 result.fill(np.nan)
613 for i in np.unique(binnumbers):
614 for vv in builtins.range(Vdim):
615 result[vv, i] = np.min(values[vv, binnumbers == i])
616 elif statistic == 'max':
617 result.fill(np.nan)
618 for i in np.unique(binnumbers):
619 for vv in builtins.range(Vdim):
620 result[vv, i] = np.max(values[vv, binnumbers == i])
621 elif callable(statistic):
622 with np.errstate(invalid='ignore'), suppress_warnings() as sup:
623 sup.filter(RuntimeWarning)
624 try:
625 null = statistic([])
626 except Exception:
627 null = np.nan
628 result.fill(null)
629 for i in np.unique(binnumbers):
630 for vv in builtins.range(Vdim):
631 result[vv, i] = statistic(values[vv, binnumbers == i])
633 # Shape into a proper matrix
634 result = result.reshape(np.append(Vdim, nbin))
636 # Remove outliers (indices 0 and -1 for each bin-dimension).
637 core = tuple([slice(None)] + Ndim * [slice(1, -1)])
638 result = result[core]
640 # Unravel binnumbers into an ndarray, each row the bins for each dimension
641 if(expand_binnumbers and Ndim > 1):
642 binnumbers = np.asarray(np.unravel_index(binnumbers, nbin))
644 if np.any(result.shape[1:] != nbin - 2):
645 raise RuntimeError('Internal Shape Error')
647 # Reshape to have output (`result`) match input (`values`) shape
648 result = result.reshape(input_shape[:-1] + list(nbin-2))
650 return BinnedStatisticddResult(result, edges, binnumbers)
653def _bin_edges(sample, bins=None, range=None):
654 """ Create edge arrays
655 """
656 Dlen, Ndim = sample.shape
658 nbin = np.empty(Ndim, int) # Number of bins in each dimension
659 edges = Ndim * [None] # Bin edges for each dim (will be 2D array)
660 dedges = Ndim * [None] # Spacing between edges (will be 2D array)
662 # Select range for each dimension
663 # Used only if number of bins is given.
664 if range is None:
665 smin = np.atleast_1d(np.array(sample.min(axis=0), float))
666 smax = np.atleast_1d(np.array(sample.max(axis=0), float))
667 else:
668 smin = np.zeros(Ndim)
669 smax = np.zeros(Ndim)
670 for i in builtins.range(Ndim):
671 smin[i], smax[i] = range[i]
673 # Make sure the bins have a finite width.
674 for i in builtins.range(len(smin)):
675 if smin[i] == smax[i]:
676 smin[i] = smin[i] - .5
677 smax[i] = smax[i] + .5
679 # Create edge arrays
680 for i in builtins.range(Ndim):
681 if np.isscalar(bins[i]):
682 nbin[i] = bins[i] + 2 # +2 for outlier bins
683 edges[i] = np.linspace(smin[i], smax[i], nbin[i] - 1)
684 else:
685 edges[i] = np.asarray(bins[i], float)
686 nbin[i] = len(edges[i]) + 1 # +1 for outlier bins
687 dedges[i] = np.diff(edges[i])
689 nbin = np.asarray(nbin)
691 return nbin, edges, dedges
694def _bin_numbers(sample, nbin, edges, dedges):
695 """Compute the bin number each sample falls into, in each dimension
696 """
697 Dlen, Ndim = sample.shape
699 sampBin = [
700 np.digitize(sample[:, i], edges[i])
701 for i in range(Ndim)
702 ]
704 # Using `digitize`, values that fall on an edge are put in the right bin.
705 # For the rightmost bin, we want values equal to the right
706 # edge to be counted in the last bin, and not as an outlier.
707 for i in range(Ndim):
708 # Find the rounding precision
709 dedges_min = dedges[i].min()
710 if dedges_min == 0:
711 raise ValueError('The smallest edge difference is numerically 0.')
712 decimal = int(-np.log10(dedges_min)) + 6
713 # Find which points are on the rightmost edge.
714 on_edge = np.where(np.around(sample[:, i], decimal) ==
715 np.around(edges[i][-1], decimal))[0]
716 # Shift these points one bin to the left.
717 sampBin[i][on_edge] -= 1
719 # Compute the sample indices in the flattened statistic matrix.
720 binnumbers = np.ravel_multi_index(sampBin, nbin)
722 return binnumbers