acgc.stats.weighted

Weighted variance, covariance, correlation, median, and quantiles

  1#!/usr/bin/env python3
  2# -*- coding: utf-8 -*-
  3""" Weighted variance, covariance, correlation, median, and quantiles
  4"""
  5
  6import numpy as np
  7from sklearn.covariance import MinCovDet
  8from scipy.interpolate import interp1d
  9
# Public API of this module: the names exported by `from <module> import *`.
# The private helper `_wscale` is not exported.
__all__ = [
    'wcorr',
    'wcorrcoef',
    'wcov',
    'wmean',
    'wmedian',
    'wquantile',
    'wstd',
    'wvar'
]
 20
 21def wmean(x,w=None,robust=False):
 22    '''Weighted mean 
 23    
 24    Calculate the mean of x using weights w.
 25    
 26    Parameters
 27    ----------
 28    x : array_like 
 29        values to be averaged
 30    w : array_like, default=None
 31        weights for each element of x; can be ommitted if robust=True
 32    robust : bool, default=False
 33        if robust=True, weights will be internally calculated using FastMCD;
 34        ignored if w is used
 35        
 36    Returns
 37    -------
 38    float
 39        weighted mean of x 
 40    '''
 41    if w is None:
 42        if robust:
 43            # Use FastMCD to calculate weights; Another method could be used here
 44            w = MinCovDet().fit( np.array([x,x]).T ).support_
 45        else:
 46            raise ValueError('must specify weights w or select robust=True')
 47    else:
 48        assert len(w) == len(x), 'w must be the same length as x'
 49
 50    return np.sum( x * w ) / np.sum(w)
 51
 52def wstd(x,w=None,ddof=1,robust=False):
 53    '''Weighted standard deviation
 54    
 55    Calculate the standard deviation of x using weights w. If ddof=1 (default),
 56    then the result is the unbiased (sample) standard deviation when w=1.
 57    
 58    Parameters
 59    ----------
 60    x : array_like 
 61        values to be analyzed
 62    w : array_like, default=None
 63        weights for each element of x; can be ommitted if robust=True
 64    ddof : int, default=1
 65        differential degrees of freedom. See note above.
 66    robust : bool, default=False
 67        if robust=True, weights will be internally calculated using FastMCD;
 68        ignored if w is used
 69        
 70    Returns
 71    -------
 72    float
 73        weighted standard deviation of x 
 74    '''
 75    if w is None:
 76        if robust:
 77            # Use FastMCD to calculate weights; Another method could be used here
 78            w = MinCovDet().fit( np.array([x,x]).T ).support_
 79        else:
 80            raise ValueError('must specify weights w or select robust=True')
 81    else:
 82        assert len(w) == len(x), 'w must be the same length as x'
 83
 84    return np.sqrt( wcov(x,x,w,ddof,robust) )
 85
 86def wvar(x,w=None,ddof=1,robust=False):
 87    '''Weighted variance 
 88    
 89    Calculate the variance of x using weights w. If ddof=1 (default),
 90    then the result is the unbiased (sample) variance when w=1.
 91    
 92    Parameters
 93    ----------
 94    x : array_like 
 95        values to be analyzed
 96    w : array_like, default=None
 97        weights for each element of x; can be ommitted if robust=True
 98    ddof : int, default=1
 99        differential degrees of freedom. See note above.
100    robust : bool, default=False
101        if robust=True, weights will be internally calculated using FastMCD;
102        ignored if w is used
103        
104    Returns
105    -------
106    float
107        weighted variance of x 
108    '''
109    if w is None:
110        if robust:
111            # Use FastMCD to calculate weights; Another method could be used here
112            w = MinCovDet().fit( np.array([x,x]).T ).support_
113        else:
114            raise ValueError('must specify weights w or select robust=True')
115    else:
116        assert len(w) == len(x), 'w must be the same length as x'
117
118    return wcov(x,x,w,ddof,robust)
119
def wcov(x,y,w=None,ddof=1,robust=False):
    '''Weighted covariance

    Calculate the covariance of x and y using weights w. If ddof=1 (default),
    then the result is the unbiased (sample) covariance when w=1.

    Implements weighted covariance as defined by NIST Dataplot
    https://www.itl.nist.gov/div898/software/dataplot/refman2/ch2/weighvar.pdf

    Parameters
    ----------
    x, y : array_like
        values to be analyzed
    w : array_like, default=None
        weights for each element of x; can be omitted if robust=True
    ddof : int (default=1)
        differential degrees of freedom. See note above.
    robust : bool, default=False
        if robust=True, weights will be internally calculated using FastMCD
        applied to the (x, y) pairs; ignored if w is used

    Returns
    -------
    float
        weighted covariance of x and y

    Raises
    ------
    ValueError
        if w is None and robust is False
    AssertionError
        if y or w differ in length from x

    Notes
    -----
    The denominator is sum(w) * (nw - ddof) / nw, where nw is the number of
    non-zero weights. The result is therefore not finite when nw <= ddof.
    '''
    # Convert up front so plain lists and tuples are accepted
    x = np.asarray(x)
    y = np.asarray(y)

    assert len(y) == len(x), 'y must be the same length as x'

    if w is None:
        if not robust:
            raise ValueError('must specify weights w or select robust=True')
        # Use FastMCD to calculate weights; Another method could be used here.
        # Fit the joint (x, y) sample so that outliers in either variable
        # are excluded from the covariance.
        w = MinCovDet().fit( np.array([x,y]).T ).support_
    else:
        w = np.asarray(w)
        assert len(w) == len(x), 'w must be the same length as x'

    # Rescale so the largest weight is 1 (a weight of 1 is treated as one
    # degree of freedom)
    w = w / np.max( w )

    # Number of observations that actually contribute
    nw = np.count_nonzero(w)

    # Weighted means of each variable
    wtotal = np.sum(w)
    xmean = np.sum( x * w ) / wtotal
    ymean = np.sum( y * w ) / wtotal

    return np.sum( ( x - xmean ) * ( y - ymean ) * w ) / \
        ( wtotal / nw * (nw - ddof) )
163
def wcorr(x,y,w=None,robust=False):
    '''Weighted correlation coefficient

    Calculate the Pearson linear correlation coefficient of x and y using weights w.
    This is derived from the weighted covariance and weighted variance.
    Equivalent to R2w in Eq. 4 from Willett and Singer (1988).
    See notes below on interpretation.

    Parameters
    ----------
    x, y : array_like
        values to be analyzed
    w : array_like, default=None
        weights for each element of x; can be omitted if robust=True
    robust : bool, default=False
        if robust=True, weights will be internally calculated using FastMCD
        applied to the (x, y) pairs; ignored if w is used

    Returns
    -------
    float
        weighted correlation coefficient of x and y

    Raises
    ------
    ValueError
        if w is None and robust is False
    AssertionError
        if y or w differ in length from x

    Notes
    -----
    There is no ddof parameter: the covariance and both variances are
    computed with the same weights and the same ddof, so the
    degrees-of-freedom factor cancels in the ratio.

    The weighted correlation coefficient here may be useful to construct
    a "weighted R2" or "weighted coefficient of determination" from a
    Weighted Least Squares regression: R2w = wcorr(x,y,w)**2. The resulting R2w is
    equivalent to equation 4 from Willett and Singer (1988, American Statistician)
    and their caveats apply.

    Craig Milligan provides the following advice in an online forum:
    "What is the meaning of this (corrected) weighted r-squared?
    Willett and Singer interpret it as: "the coefficient of determination in the
    transformed [weighted] dataset. It is a measure of the proportion of the
    variation in weighted Y that can be accounted for by weighted X, and is the
    quantity that is output as R2 by the major statistical computer packages when a
    WLS regression is performed".

    Is it meaningful as a measure of goodness of fit?
    This depends on how it is presented and interpreted. Willett and Singer caution
    that it is typically quite a bit higher than the r-squared obtained in ordinary
    least squares regression, and the high value encourages prominent display...
    but this display may be deceptive IF it is interpreted in the conventional
    sense of r-squared (as the proportion of unweighted variation explained by a
    model). Willett and Singer propose that a less 'deceptive' alternative is
    their equation 7 [usual unweighted R2, called pseudor2wls by Willett and Singer].
    In general, Willett and Singer also caution that it is
    not good to rely on any r2 (even their pseudor2wls) as a sole measure of
    goodness of fit. Despite these cautions, the whole premise of robust regression
    is that some cases are judged 'not as good' and don't count as much in the model
    fitting, and it may be good to reflect this in part of the model assessment
    process. The weighted r-squared described, can be one good measure of goodness
    of fit - as long as the correct interpretation is clearly given in the
    presentation and it is not relied on as the sole assessment of goodness of fit."

    Willett, J. B. and Singer, J. D.: Another Cautionary Note About R2:
        Its Use in Weighted Least-Squares Regression-Analysis, American Statistician,
        42(3), 236–238, 1988.
    '''
    # Convert up front so plain lists and tuples are accepted
    x = np.asarray(x)
    y = np.asarray(y)

    assert len(y) == len(x), 'y must be the same length as x'

    if w is None:
        if not robust:
            raise ValueError('must specify weights w or select robust=True')
        # Use FastMCD to calculate weights; Another method could be used here.
        # Fit the joint (x, y) sample so that outliers in either variable
        # are excluded from the correlation.
        w = MinCovDet().fit( np.array([x,y]).T ).support_
    else:
        w = np.asarray(w)
        assert len(w) == len(x), 'w must be the same length as x'

    w = _wscale(w)
    return wcov(x,y,w) / np.sqrt( wvar(x,w) * wvar(y,w) )
240
# Alias for wcorr: both names are bound to the same function object
wcorrcoef = wcorr
243
def _wscale(w):
    '''Scale array to a maximum value of 1

    Rescale array to a maximum value of 1.
    In weighted averaging, we will assume that a weight of 1 means 1 degree of
    freedom in the observations.

    Parameters
    ----------
    w : array_like
        weights

    Returns
    -------
    numpy.ndarray
        input array rescaled so the largest element is 1
    '''
    # Convert first so plain lists and tuples are accepted; dividing a
    # Python list by a scalar would otherwise raise TypeError
    w = np.asarray( w )
    return w / np.max( w )
262
def wmedian(x,w,**kwargs):
    '''Weighted median

    Convenience wrapper that evaluates `wquantile` at the 0.5 quantile.
    See documentation for `wquantile`.

    Parameters
    ----------
    x : array_like
        values to be analyzed
    w : array_like
        weights for each element of x, representing frequency
    **kwargs passed to `wquantile`

    Returns
    -------
    float
        weighted median of x

    '''
    # The median is the quantile that splits the weights in half
    median_quantile = 0.5
    return wquantile(x, median_quantile, w, **kwargs)
283
def _wquantile_partition(xs, ws, wsum, wsumr, q, average):
    '''Weighted quantile(s) by the partition method; private helper of `wquantile`

    Parameters
    ----------
    xs, ws : ndarray (N,)
        data values sorted in ascending order, and their weights in the same order
    wsum, wsumr : ndarray (N,)
        forward and reverse cumulative sums of ws, each normalized to a total of 1
    q : float or sequence
        quantile(s) in range 0-1; sequences are evaluated element by element
    average : bool
        if True, average the two candidate elements when they differ;
        if False, return the candidate with the smaller weight

    Returns
    -------
    scalar, or list when q is a sequence
    '''
    # Multiple quantiles: evaluate each one, reusing the sorted data
    if np.ndim(q) > 0:
        return [_wquantile_partition(xs, ws, wsum, wsumr, qi, average) for qi in q]

    # Lower bound for quantile; il is an index into the sorted array
    if q <= wsum[0]:
        il = 0
    else:
        il = np.flatnonzero( wsum < q )[-1] + 1

    # Upper bound for quantile; iu is an index into the sorted array
    if (1-q) <= wsumr[-1]:
        iu = len(xs) - 1
    else:
        iu = np.flatnonzero( wsumr < (1-q) )[0] - 1

    if il == iu:
        # Upper and lower bounds are the same; we're done
        return xs[il]

    if average:
        # Average the upper and lower bounds.
        # This creates an element not found in the input array,
        # which may be inappropriate in some cases
        return np.mean( xs[[il,iu]] )

    # Choose the element with the smaller weight (the lower one on a tie).
    # This guarantees that the value is an element of the input array
    if ws[il] <= ws[iu]:
        return xs[il]
    return xs[iu]

def wquantile(x,q,w,interpolation='partition'):
    '''Weighted quantile

    Calculate the quantile q from data array x using data weights w.
    If weights reflect the relative frequency of the elements of x in a large population,
    then the weighted quantile result is equivalent to numpy.quantile or numpy.percentile
    operating on an array of that larger population (containing many repeated elements).

    For uniform weights and interpolation=partition or partition0, this function
    differs from numpy.percentile. This behavior is expected and desirable.
    The numpy.percentile behavior can be reproduced here by using
    interpolation = linear, lower, higher, nearest, or midpoint.

    This naive algorithm sorts the data, so it is O(n log n), and may be slow
    for large samples (x).
    Consider using Robustats or another optimized package.

    Parameters
    ----------
    x : array_like (N,)
        values to be analyzed
    q : float or list (m,)
        quantiles that will be calculated, in range 0-1
    w : array_like (N,)
        weights for each element of x.
        These represent the frequency of elements x in a large population
        and must be non-negative with at least one positive value
    interpolation : {'partition' (default),'partition0','linear','nearest','lower','higher','midpoint'}
        This parameter specifies the interpolation method to use when the desired quantile
        lies between elements i < j in x. Allowed values:
        - 'partition': [default] choose the element of x that partitions the
                        sum of weights on either side to q and (1-q)
                        When two elements both satisfy partition, then average them.
                        This is the Edgeworth method (https://en.wikipedia.org/wiki/Weighted_median)
        - 'partition0': Same as partition, but result is always an element of x (no averaging).
                        Instead return the element of x that partitions weights most closely
                        to q and (1-q) or, if there is still a tie, then the smaller element.
                        'partition2' is accepted as an alias.
        - 'linear'  : i + (j-i) * fraction. replicates behavior of numpy.quantile when all
                    weights are equal
        - 'nearest' : i or j element that most closely divides data at the q quantile
        - 'lower'   : i, the largest element <= the q quantile
        - 'higher'  : j, the smallest element >= the q quantile
        - 'midpoint': average of the 'lower' and 'higher' results

    Returns
    -------
    scalar, list, or array
        weighted quantiles of x. For the partition methods the result is a
        scalar when q is a scalar and a list of the same length as q when q
        is a sequence. For the other methods the result is an array with the
        same shape as q.

    Raises
    ------
    ValueError
        if x is empty, w and x differ in length, x or w contain non-finite
        values, w contains negative values or no positive value, q is outside
        the range 0-1, or interpolation is not recognized
    '''

    # Partition-type methods, and interp1d `kind` for the interpolating methods
    partition_methods = ('partition','partition0','partition2')
    interp_kinds = {'linear':'linear', 'nearest':'nearest',
                    'lower':'previous', 'higher':'next'}

    # Ensure arguments are arrays
    x = np.asarray( x )
    w = np.asarray( w )

    # Number of elements
    n = len(x)

    # Ensure weights are same length as x
    if len(w) != n:
        raise ValueError( 'weights w must be the same length as array x')
    if n == 0:
        raise ValueError( 'Array x is empty')

    # Ensure inputs are all finite, no NaN or Inf
    if np.any( ~np.isfinite(x) ):
        raise ValueError( 'Array x contains non-finite elements')
    if np.any( ~np.isfinite(w) ):
        raise ValueError( 'Weights w contains non-finite elements')

    # Weights are frequencies: none negative, and not all zero
    if np.any( w < 0 ):
        raise ValueError( 'Weights w must be non-negative')
    if not np.any( w > 0 ):
        raise ValueError( 'Weights w must contain at least one positive value')

    # Quantiles must lie in 0-1 (this comparison also rejects NaN)
    qarr = np.asarray( q )
    if np.any( ~( (qarr >= 0) & (qarr <= 1) ) ):
        raise ValueError( 'Quantile q must be in the range 0-1')

    if ( interpolation not in partition_methods
            and interpolation not in interp_kinds
            and interpolation != 'midpoint' ):
        raise ValueError('Unrecognized value for interpolation: ' + str(interpolation))

    # Sort x from smallest to largest; done once for all requested quantiles
    idx = np.argsort( x )
    xs = x[idx]
    ws = w[idx]

    if interpolation in partition_methods:

        # Cumulative and reverse-cumulative sums of weights. Each is
        # normalized by its own total so that the end values are exactly 1;
        # otherwise rounding can push the bounds outside the array at q=0 or q=1.
        csum = np.cumsum( ws )
        rsum = np.cumsum( ws[::-1] )[::-1]
        wsum  = csum / csum[-1]
        wsumr = rsum / rsum[0]

        return _wquantile_partition( xs, ws, wsum, wsumr, q,
                                     interpolation == 'partition' )

    # These methods give the same results as numpy.percentile when using
    # the same interpolation method and uniform weights.

    if n == 1:
        # interp1d needs at least two points; with a single element every
        # quantile is that element
        return np.full( np.shape(q), xs[0], dtype=float )

    # Define the quantile for each element in x.
    # Use double precision so that results match numpy for uniform weights.
    wf      = ws.astype(np.float64)
    qx      = wf / 2 - wf[0] / 2
    qx[1:] += np.cumsum( wf )[:-1]
    qx     /= qx[-1]

    # Interpolate to get quantile value
    if interpolation == 'midpoint':
        # Average the lower and higher values
        lower  = interp1d(qx,xs,kind='previous')
        higher = interp1d(qx,xs,kind='next')
        return ( lower(q) + higher(q) ) / 2

    return interp1d(qx,xs,kind=interp_kinds[interpolation])(q)
def wcorr(x, y, w=None, robust=False):
def wcorr(x,y,w=None,robust=False):
    '''Weighted correlation coefficient

    Calculate the Pearson linear correlation coefficient of x and y using weights w.
    This is derived from the weighted covariance and weighted variance.
    Equivalent to R2w in Eq. 4 from Willett and Singer (1988).
    See notes below on interpretation.

    Parameters
    ----------
    x, y : array_like
        values to be analyzed
    w : array_like, default=None
        weights for each element of x; can be omitted if robust=True
    robust : bool, default=False
        if robust=True, weights will be internally calculated using FastMCD
        applied to the (x, y) pairs; ignored if w is used

    Returns
    -------
    float
        weighted correlation coefficient of x and y

    Raises
    ------
    ValueError
        if w is None and robust is False
    AssertionError
        if y or w differ in length from x

    Notes
    -----
    There is no ddof parameter: the covariance and both variances are
    computed with the same weights and the same ddof, so the
    degrees-of-freedom factor cancels in the ratio.

    The weighted correlation coefficient here may be useful to construct
    a "weighted R2" or "weighted coefficient of determination" from a
    Weighted Least Squares regression: R2w = wcorr(x,y,w)**2. The resulting R2w is
    equivalent to equation 4 from Willett and Singer (1988, American Statistician)
    and their caveats apply.

    Craig Milligan provides the following advice in an online forum:
    "What is the meaning of this (corrected) weighted r-squared?
    Willett and Singer interpret it as: "the coefficient of determination in the
    transformed [weighted] dataset. It is a measure of the proportion of the
    variation in weighted Y that can be accounted for by weighted X, and is the
    quantity that is output as R2 by the major statistical computer packages when a
    WLS regression is performed".

    Is it meaningful as a measure of goodness of fit?
    This depends on how it is presented and interpreted. Willett and Singer caution
    that it is typically quite a bit higher than the r-squared obtained in ordinary
    least squares regression, and the high value encourages prominent display...
    but this display may be deceptive IF it is interpreted in the conventional
    sense of r-squared (as the proportion of unweighted variation explained by a
    model). Willett and Singer propose that a less 'deceptive' alternative is
    their equation 7 [usual unweighted R2, called pseudor2wls by Willett and Singer].
    In general, Willett and Singer also caution that it is
    not good to rely on any r2 (even their pseudor2wls) as a sole measure of
    goodness of fit. Despite these cautions, the whole premise of robust regression
    is that some cases are judged 'not as good' and don't count as much in the model
    fitting, and it may be good to reflect this in part of the model assessment
    process. The weighted r-squared described, can be one good measure of goodness
    of fit - as long as the correct interpretation is clearly given in the
    presentation and it is not relied on as the sole assessment of goodness of fit."

    Willett, J. B. and Singer, J. D.: Another Cautionary Note About R2:
        Its Use in Weighted Least-Squares Regression-Analysis, American Statistician,
        42(3), 236–238, 1988.
    '''
    # Convert up front so plain lists and tuples are accepted
    x = np.asarray(x)
    y = np.asarray(y)

    assert len(y) == len(x), 'y must be the same length as x'

    if w is None:
        if not robust:
            raise ValueError('must specify weights w or select robust=True')
        # Use FastMCD to calculate weights; Another method could be used here.
        # Fit the joint (x, y) sample so that outliers in either variable
        # are excluded from the correlation.
        w = MinCovDet().fit( np.array([x,y]).T ).support_
    else:
        w = np.asarray(w)
        assert len(w) == len(x), 'w must be the same length as x'

    w = _wscale(w)
    return wcov(x,y,w) / np.sqrt( wvar(x,w) * wvar(y,w) )

Weighted correlation coefficient

Calculate the Pearson linear correlation coefficient of x and y using weights w. This is derived from the weighted covariance and weighted variance. Equivalent to R2w in Eq. 4 from Willett and Singer (1988). See notes below on interpretation.

Parameters
  • x, y (array_like): values to be analyzed
  • w (array_like, default=None): weights for each element of x; can be omitted if robust=True
  • ddof (int, default=1): differential degrees of freedom. See note above.
  • robust (bool, default=False): if robust=True, weights will be internally calculated using FastMCD; ignored if w is used
Returns
  • float: weighted correlation coefficient of x and y
Notes

The weighted correlation coefficient here may be useful to construct a "weighted R2" or "weighted coefficient of determination" from a Weighted Least Squares regression: R2w = wcorr(x,y,w)**2. The resulting R2w is equivalent to equation 4 from Willett and Singer (1988, American Statistician) and their caveats apply.

Craig Milligan provides the following advice in an online forum: "What is the meaning of this (corrected) weighted r-squared? Willett and Singer interpret it as: "the coefficient of determination in the transformed [weighted] dataset. It is a measure of the proportion of the variation in weighted Y that can be accounted for by weighted X, and is the quantity that is output as R2 by the major statistical computer packages when a WLS regression is performed".

Is it meaningful as a measure of goodness of fit? This depends on how it is presented and interpreted. Willett and Singer caution that it is typically quite a bit higher than the r-squared obtained in ordinary least squares regression, and the high value encourages prominent display... but this display may be deceptive IF it is interpreted in the conventional sense of r-squared (as the proportion of unweighted variation explained by a model). Willett and Singer propose that a less 'deceptive' alternative is their equation 7 [usual unweighted R2, called pseudor2wls by Willett and Singer]. In general, Willett and Singer also caution that it is not good to rely on any r2 (even their pseudor2wls) as a sole measure of goodness of fit. Despite these cautions, the whole premise of robust regression is that some cases are judged 'not as good' and don't count as much in the model fitting, and it may be good to reflect this in part of the model assessment process. The weighted r-squared described, can be one good measure of goodness of fit - as long as the correct interpretation is clearly given in the presentation and it is not relied on as the sole assessment of goodness of fit."

Willett, J. B. and Singer, J. D.: Another Cautionary Note About R2: Its Use in Weighted Least-Squares Regression-Analysis, American Statistician, 42(3), 236–238, 1988.

def wcorrcoef(x, y, w=None, robust=False):
def wcorr(x,y,w=None,robust=False):
    '''Weighted correlation coefficient

    Calculate the Pearson linear correlation coefficient of x and y using weights w.
    This is derived from the weighted covariance and weighted variance.
    Equivalent to R2w in Eq. 4 from Willett and Singer (1988).
    See notes below on interpretation.

    Parameters
    ----------
    x, y : array_like
        values to be analyzed
    w : array_like, default=None
        weights for each element of x; can be omitted if robust=True
    robust : bool, default=False
        if robust=True, weights will be internally calculated using FastMCD
        applied to the (x, y) pairs; ignored if w is used

    Returns
    -------
    float
        weighted correlation coefficient of x and y

    Raises
    ------
    ValueError
        if w is None and robust is False
    AssertionError
        if y or w differ in length from x

    Notes
    -----
    There is no ddof parameter: the covariance and both variances are
    computed with the same weights and the same ddof, so the
    degrees-of-freedom factor cancels in the ratio.

    The weighted correlation coefficient here may be useful to construct
    a "weighted R2" or "weighted coefficient of determination" from a
    Weighted Least Squares regression: R2w = wcorr(x,y,w)**2. The resulting R2w is
    equivalent to equation 4 from Willett and Singer (1988, American Statistician)
    and their caveats apply.

    Craig Milligan provides the following advice in an online forum:
    "What is the meaning of this (corrected) weighted r-squared?
    Willett and Singer interpret it as: "the coefficient of determination in the
    transformed [weighted] dataset. It is a measure of the proportion of the
    variation in weighted Y that can be accounted for by weighted X, and is the
    quantity that is output as R2 by the major statistical computer packages when a
    WLS regression is performed".

    Is it meaningful as a measure of goodness of fit?
    This depends on how it is presented and interpreted. Willett and Singer caution
    that it is typically quite a bit higher than the r-squared obtained in ordinary
    least squares regression, and the high value encourages prominent display...
    but this display may be deceptive IF it is interpreted in the conventional
    sense of r-squared (as the proportion of unweighted variation explained by a
    model). Willett and Singer propose that a less 'deceptive' alternative is
    their equation 7 [usual unweighted R2, called pseudor2wls by Willett and Singer].
    In general, Willett and Singer also caution that it is
    not good to rely on any r2 (even their pseudor2wls) as a sole measure of
    goodness of fit. Despite these cautions, the whole premise of robust regression
    is that some cases are judged 'not as good' and don't count as much in the model
    fitting, and it may be good to reflect this in part of the model assessment
    process. The weighted r-squared described, can be one good measure of goodness
    of fit - as long as the correct interpretation is clearly given in the
    presentation and it is not relied on as the sole assessment of goodness of fit."

    Willett, J. B. and Singer, J. D.: Another Cautionary Note About R2:
        Its Use in Weighted Least-Squares Regression-Analysis, American Statistician,
        42(3), 236–238, 1988.
    '''
    # Convert up front so plain lists and tuples are accepted
    x = np.asarray(x)
    y = np.asarray(y)

    assert len(y) == len(x), 'y must be the same length as x'

    if w is None:
        if not robust:
            raise ValueError('must specify weights w or select robust=True')
        # Use FastMCD to calculate weights; Another method could be used here.
        # Fit the joint (x, y) sample so that outliers in either variable
        # are excluded from the correlation.
        w = MinCovDet().fit( np.array([x,y]).T ).support_
    else:
        w = np.asarray(w)
        assert len(w) == len(x), 'w must be the same length as x'

    w = _wscale(w)
    return wcov(x,y,w) / np.sqrt( wvar(x,w) * wvar(y,w) )

Weighted correlation coefficient

Calculate the Pearson linear correlation coefficient of x and y using weights w. This is derived from the weighted covariance and weighted variance. Equivalent to R2w in Eq. 4 from Willett and Singer (1988). See notes below on interpretation.

Parameters
  • x, y (array_like): values to be analyzed
  • w (array_like, default=None): weights for each element of x; can be omitted if robust=True
  • ddof (int, default=1): differential degrees of freedom. See note above.
  • robust (bool, default=False): if robust=True, weights will be internally calculated using FastMCD; ignored if w is used
Returns
  • float: weighted correlation coefficient of x and y
Notes

The weighted correlation coefficient here may be useful to construct a "weighted R2" or "weighted coefficient of determination" from a Weighted Least Squares regression: R2w = wcorr(x,y,w)**2. The resulting R2w is equivalent to equation 4 from Willett and Singer (1988, American Statistician) and their caveats apply.

Craig Milligan provides the following advice in an online forum: "What is the meaning of this (corrected) weighted r-squared? Willett and Singer interpret it as: "the coefficient of determination in the transformed [weighted] dataset. It is a measure of the proportion of the variation in weighted Y that can be accounted for by weighted X, and is the quantity that is output as R2 by the major statistical computer packages when a WLS regression is performed".

Is it meaningful as a measure of goodness of fit? This depends on how it is presented and interpreted. Willett and Singer caution that it is typically quite a bit higher than the r-squared obtained in ordinary least squares regression, and the high value encourages prominent display... but this display may be deceptive IF it is interpreted in the conventional sense of r-squared (as the proportion of unweighted variation explained by a model). Willett and Singer propose that a less 'deceptive' alternative is their equation 7 [usual unweighted R2, called pseudor2wls by Willett and Singer]. In general, Willett and Singer also caution that it is not good to rely on any r2 (even their pseudor2wls) as a sole measure of goodness of fit. Despite these cautions, the whole premise of robust regression is that some cases are judged 'not as good' and don't count as much in the model fitting, and it may be good to reflect this in part of the model assessment process. The weighted r-squared described, can be one good measure of goodness of fit - as long as the correct interpretation is clearly given in the presentation and it is not relied on as the sole assessment of goodness of fit."

Willett, J. B. and Singer, J. D.: Another Cautionary Note About R2: Its Use in Weighted Least-Squares Regression-Analysis, American Statistician, 42(3), 236–238, 1988.

def wcov(x, y, w=None, ddof=1, robust=False):
def wcov(x,y,w=None,ddof=1,robust=False):
    '''Weighted covariance

    Calculate the covariance of x and y using weights w. If ddof=1 (default),
    then the result is the unbiased (sample) covariance when w=1.

    Implements weighted covariance as defined by NIST Dataplot
    https://www.itl.nist.gov/div898/software/dataplot/refman2/ch2/weighvar.pdf

    Parameters
    ----------
    x, y : array_like
        values to be analyzed
    w : array_like, default=None
        weights for each element of x; can be omitted if robust=True
    ddof : int (default=1)
        differential degrees of freedom. See note above.
    robust : bool, default=False
        if robust=True, weights will be internally calculated using FastMCD
        applied to the (x, y) pairs; ignored if w is used

    Returns
    -------
    float
        weighted covariance of x and y

    Raises
    ------
    ValueError
        if w is None and robust is False
    AssertionError
        if y or w differ in length from x

    Notes
    -----
    The denominator is sum(w) * (nw - ddof) / nw, where nw is the number of
    non-zero weights. The result is therefore not finite when nw <= ddof.
    '''
    # Convert up front so plain lists and tuples are accepted
    x = np.asarray(x)
    y = np.asarray(y)

    assert len(y) == len(x), 'y must be the same length as x'

    if w is None:
        if not robust:
            raise ValueError('must specify weights w or select robust=True')
        # Use FastMCD to calculate weights; Another method could be used here.
        # Fit the joint (x, y) sample so that outliers in either variable
        # are excluded from the covariance.
        w = MinCovDet().fit( np.array([x,y]).T ).support_
    else:
        w = np.asarray(w)
        assert len(w) == len(x), 'w must be the same length as x'

    # Rescale so the largest weight is 1 (a weight of 1 is treated as one
    # degree of freedom)
    w = w / np.max( w )

    # Number of observations that actually contribute
    nw = np.count_nonzero(w)

    # Weighted means of each variable
    wtotal = np.sum(w)
    xmean = np.sum( x * w ) / wtotal
    ymean = np.sum( y * w ) / wtotal

    return np.sum( ( x - xmean ) * ( y - ymean ) * w ) / \
        ( wtotal / nw * (nw - ddof) )

Weighted covariance

Calculate the covariance of x and y using weights w. If ddof=1 (default), then the result is the unbiased (sample) covariance when w=1.

Implements weighted covariance as defined by NIST Dataplot https://www.itl.nist.gov/div898/software/dataplot/refman2/ch2/weighvar.pdf

Parameters
  • x, y (array_like): values to be analyzed
  • w (array_like, default=None): weights for each element of x; can be omitted if robust=True
  • ddof (int (default=1)): differential degrees of freedom. See note above.
  • robust (bool, default=False): if robust=True, weights will be internally calculated using FastMCD; ignored if w is used
Returns
  • float: weighted covariance of x and y
def wmean(x, w=None, robust=False):
def wmean(x,w=None,robust=False):
    '''Weighted mean

    Calculate the mean of x using weights w, i.e. sum(x*w) / sum(w).

    Parameters
    ----------
    x : array_like
        values to be averaged
    w : array_like, default=None
        weights for each element of x; can be omitted if robust=True
    robust : bool, default=False
        if robust=True, weights will be internally calculated using FastMCD;
        ignored if w is used

    Returns
    -------
    float
        weighted mean of x

    Raises
    ------
    ValueError
        if w is None and robust is False, or if w and x differ in length
    '''
    # Convert once so that plain Python sequences (lists, tuples) can be
    # multiplied elementwise below
    x = np.asarray( x )

    if w is None:
        if not robust:
            raise ValueError('must specify weights w or select robust=True')
        # Use FastMCD to calculate weights; Another method could be used here.
        # The fitted support_ attribute is used directly as the weights.
        w = MinCovDet().fit( np.array([x,x]).T ).support_
    else:
        w = np.asarray( w )
        # Explicit raise rather than assert, so the check survives python -O
        if len(w) != len(x):
            raise ValueError('w must be the same length as x')

    return np.sum( x * w ) / np.sum(w)

Weighted mean

Calculate the mean of x using weights w.

Parameters
  • x (array_like): values to be averaged
  • w (array_like, default=None): weights for each element of x; can be omitted if robust=True
  • robust (bool, default=False): if robust=True, weights will be internally calculated using FastMCD; ignored if w is used
Returns
  • float: weighted mean of x
def wmedian(x, w, **kwargs):
def wmedian(x,w,**kwargs):
    '''Weighted median

    Convenience wrapper that evaluates `wquantile` at the 0.5 quantile.
    See documentation for `wquantile`.

    Parameters
    ----------
    x : array_like
        values to be analyzed
    w : array_like
        weights for each element of x, representing frequency
    **kwargs
        forwarded unchanged to `wquantile`

    Returns
    -------
    float
        weighted median of x

    '''
    # The median is the quantile that splits the weight in half
    median_level = 0.5
    return wquantile(x, median_level, w, **kwargs)

Weighted median

See documentation for wquantile

Parameters
  • x (array_like): values to be analyzed
  • w (array_like): weights for each element of x, representing frequency
  • **kwargs passed to wquantile
Returns
  • float: weighted median of x
def wquantile(x, q, w, interpolation='partition'):
def wquantile(x,q,w,interpolation='partition'):
    '''Weighted quantile

    Calculate the quantile q from data array x using data weights w.
    If weights reflect the relative frequency of the elements of x in a large population,
    then the weighted quantile result is equivalent to numpy.quantile or numpy.percentile
    operating on an array of that larger population (containing many repeated elements).

    For uniform weights and interpolation=partition or partition0, this function
    differs from numpy.percentile. This behavior is expected and desirable.
    The numpy.percentile behavior can be reproduced here by using
    interpolation = linear, lower, higher, or nearest.
    Note that numpy.quantile is equivalent to numpy.percentile(interpolation='linear')

    This naive algorithm sorts x, so it is O(n log n), and may be slow for
    large samples (x). Consider using Robustats or another optimized package.

    Parameters
    ----------
    x : array_like (N,)
        values to be analyzed
    q : float or list (m,)
        quantiles that will be calculated, in range 0-1
    w : array_like (N,)
        weights for each element of x.
        These represent the frequency of elements x in a large population
    interpolation : {'partition' (default),'partition0','linear','nearest','lower','higher','midpoint'}
        This parameter specifies the interpolation method to use when the desired quantile
        lies between elements i < j in x. Allowed values:
        - 'partition': [default] choose the element of x that partitions the
                        sum of weights on either side to q and (1-q)
                        When two elements both satisfy partition, then average them.
                        This is the Edgeworth method (https://en.wikipedia.org/wiki/Weighted_median)
        - 'partition0': Same as partition, but result is always an element of x (no averaging).
                        Instead return the element of x that partitions weights most closely
                        to q and (1-q) or, if there is still a tie, then the smaller element.
                        ('partition2' is accepted as an alias.)
        - 'linear'  : i + (j - i) * fraction. replicates behavior of numpy.quantile when all
                    weights are equal
        - 'nearest' : i or j element that most closely divides data at the q quantile
        - 'lower'   : i, the largest element <= the q quantile
        - 'higher'  : j, the smallest element >= the q quantile
        - 'midpoint': average of the 'lower' and 'higher' results

    Returns
    -------
    scalar, list, or array
        weighted quantiles of x. For the partition methods a scalar q gives a
        scalar and a sequence q gives a list of the same length. For the other
        methods the result is whatever scipy's interp1d returns when evaluated at q.

    Raises
    ------
    ValueError
        if w and x differ in length, if x or w contain non-finite values,
        or if interpolation is not one of the recognized names
    '''

    # Ensure arguments are arrays
    x = np.asarray( x )
    w = np.asarray( w )

    # Number of elements
    n = len(x)

    # Ensure weights are same length as x
    if len(w) != n:
        raise ValueError( 'weights w must be the same length as array x')

    # Ensure inputs are all finite, no NaN or Inf
    if np.any( ~np.isfinite(x) ):
        raise ValueError( 'Array x contains non-finite elements')
    if np.any( ~np.isfinite(w) ):
        raise ValueError( 'Weights w contains non-finite elements')

    # Sort x from smallest to largest
    idx = np.argsort( x )

    # 'partition0' is the documented name; 'partition2' is kept because earlier
    # versions of this function only recognized that spelling
    if interpolation in ('partition','partition0','partition2'):

        # To calculate multiple quantiles, call function iteratively for each quantile requested
        if isinstance(q, (list, tuple, np.ndarray)):
            return [wquantile(x,qi,w,interpolation) for qi in q]

        wtotal = np.sum( w )
        # Cumulative sum of weights, divided by sum
        wsum = np.cumsum( w[idx] ) / wtotal
        # Reverse cumulative sum (cumulative sum of elements in reverse order)
        wsumr = np.cumsum( w[idx][::-1] )[::-1] / wtotal

        # Lower bound for quantile; il is an index into the sorted array
        if q <= wsum[0]:
            il = 0
        else:
            il = np.flatnonzero( wsum < q )[-1] + 1

        # Upper bound for quantile; iu is an index into the sorted array
        if (1-q) <= wsumr[-1]:
            iu = n-1
        else:
            iu = np.flatnonzero( wsumr < (1-q) )[0] - 1

        if il == iu:
            # Upper and lower bounds are the same; we're done
            xq = x[idx[il]]

        elif interpolation == 'partition':
            # Average the upper and lower bounds
            # This creates an element not found in the input array,
            # which may be inappropriate in some cases
            xq = np.mean( x[idx[[il,iu]]] )

        else:
            # Choose the element with the smaller weight (lower element on a tie)
            # This guarantees that the value is an element of the input array
            iq = il if w[idx[il]] <= w[idx[iu]] else iu
            xq = x[idx[iq]]

    else:

        # These methods give the same results as numpy.percentile when using
        # the same interpolation method and uniform weights.

        # Define the quantile for each element in x.
        # Use float64: float32 rounds the quantile nodes (e.g. 1/3), which
        # can select the wrong neighbor for 'lower'/'higher' at exact nodes.
        ws        = w.astype(np.float64)[idx]
        xs        = x[idx]
        qx        = ws / 2 - ws[0] / 2
        qx[1:]   += np.cumsum( ws )[:-1]
        qx       /= qx[-1]

        # Map our method names onto scipy interp1d kinds
        kinds = {'linear':  'linear',
                 'nearest': 'nearest',
                 'lower':   'previous',
                 'higher':  'next'}

        # Interpolate to get quantile value
        if interpolation == 'midpoint':
            # Average the lower and higher values
            xq = ( interp1d(qx,xs,kind='previous')(q) +
                   interp1d(qx,xs,kind='next')(q)     ) / 2
        elif interpolation in kinds:
            xq = interp1d(qx,xs,kind=kinds[interpolation])(q)
        else:
            raise ValueError('Unrecognized value for interpolation: ' + str(interpolation))

    return xq

Weighted quantile

Calculate the quantile q from data array x using data weights w. If weights reflect the relative frequency of the elements of x in a large population, then the weighted quantile result is equivalent to numpy.quantile or numpy.percentile operating on an array of that larger population (containing many repeated elements).

For uniform weights and interpolation=partition or partition0, this function differs from numpy.percentile. This behavior is expected and desirable. The numpy.percentile behavior can be reproduced here by using interpolation = linear, lower, higher, or nearest. Note that numpy.quantile is equivalent to numpy.percentile(interpolation='linear')

This naive algorithm sorts x, so it is O(n log n), and may be slow for large samples (x). Consider using Robustats or another optimized package.

Parameters
  • x (array_like (N,)): values to be analyzed
  • q (list (m,)): quantiles that will be calculated, in range 0-1
  • w (array_like (N,)): weights for each element of x. These represent the frequency of elements x in a large population
  • interpolation ({'partition' (default),'partition0','linear','nearest','lower','higher'}): This parameter specifies the interpolation method to use when the desired quantile lies between elements i < j in x. Allowed values:
    • 'partition': [default] choose the element of x that partitions the sum of weights on either side to q and (1-q) When two elements both satisfy partition, then average them. This is the Edgeworth method (https://en.wikipedia.org/wiki/Weighted_median)
    • 'partition0': Same as partition, but result is always an element of x (no averaging). Instead return the element of x that partitions weights most closely to q and (1-q) or, if there is still a tie, then the smaller element.
    • 'linear' : i + (j - i) * fraction. replicates behavior of numpy.quantile when all weights are equal
    • 'nearest' : i or j element that most closely divides data at the q quantile
    • 'lower' : i, the largest element <= the q quantile
    • 'higher' : j, the smallest element >= the q quantile
Result

array (m,) weighted quantiles of x. Length is the same as input q

def wstd(x, w=None, ddof=1, robust=False):
def wstd(x,w=None,ddof=1,robust=False):
    '''Weighted standard deviation

    Calculate the standard deviation of x using weights w, computed as the
    square root of the weighted covariance of x with itself (`wcov`).
    If ddof=1 (default), then the result is the unbiased (sample)
    standard deviation when w=1.

    Parameters
    ----------
    x : array_like
        values to be analyzed
    w : array_like, default=None
        weights for each element of x; can be omitted if robust=True
    ddof : int, default=1
        differential degrees of freedom. See note above.
    robust : bool, default=False
        if robust=True, weights will be internally calculated using FastMCD;
        ignored if w is used

    Returns
    -------
    float
        weighted standard deviation of x
    '''
    if w is not None:
        assert len(w) == len(x), 'w must be the same length as x'
    elif robust:
        # Use FastMCD to calculate weights; Another method could be used here
        mcd = MinCovDet().fit( np.array([x,x]).T )
        w = mcd.support_
    else:
        raise ValueError('must specify weights w or select robust=True')

    # Variance is the covariance of x with itself
    variance = wcov(x,x,w,ddof,robust)
    return np.sqrt( variance )

Weighted standard deviation

Calculate the standard deviation of x using weights w. If ddof=1 (default), then the result is the unbiased (sample) standard deviation when w=1.

Parameters
  • x (array_like): values to be analyzed
  • w (array_like, default=None): weights for each element of x; can be omitted if robust=True
  • ddof (int, default=1): differential degrees of freedom. See note above.
  • robust (bool, default=False): if robust=True, weights will be internally calculated using FastMCD; ignored if w is used
Returns
  • float: weighted standard deviation of x
def wvar(x, w=None, ddof=1, robust=False):
 87def wvar(x,w=None,ddof=1,robust=False):
 88    '''Weighted variance 
 89    
 90    Calculate the variance of x using weights w. If ddof=1 (default),
 91    then the result is the unbiased (sample) variance when w=1.
 92    
 93    Parameters
 94    ----------
 95    x : array_like 
 96        values to be analyzed
 97    w : array_like, default=None
 98        weights for each element of x; can be ommitted if robust=True
 99    ddof : int, default=1
100        differential degrees of freedom. See note above.
101    robust : bool, default=False
102        if robust=True, weights will be internally calculated using FastMCD;
103        ignored if w is used
104        
105    Returns
106    -------
107    float
108        weighted variance of x 
109    '''
110    if w is None:
111        if robust:
112            # Use FastMCD to calculate weights; Another method could be used here
113            w = MinCovDet().fit( np.array([x,x]).T ).support_
114        else:
115            raise ValueError('must specify weights w or select robust=True')
116    else:
117        assert len(w) == len(x), 'w must be the same length as x'
118
119    return wcov(x,x,w,ddof,robust)

Weighted variance

Calculate the variance of x using weights w. If ddof=1 (default), then the result is the unbiased (sample) variance when w=1.

Parameters
  • x (array_like): values to be analyzed
  • w (array_like, default=None): weights for each element of x; can be omitted if robust=True
  • ddof (int, default=1): differential degrees of freedom. See note above.
  • robust (bool, default=False): if robust=True, weights will be internally calculated using FastMCD; ignored if w is used
Returns
  • float: weighted variance of x