acgc.stats.bivariate

Bivariate statistics

Statistical measures of relationships between two populations

  1#!/usr/bin/env python3
  2# -*- coding: utf-8 -*-
  3""" Bivariate statistics
  4
  5Statistical measures of relationships between two populations
  6"""
  7
  8import numpy as np
  9from scipy import stats
 10from .bivariate_lines import sen, sma, bivariate_line_equation
 11# import xarray as xr
 12
# Names exported by `from <this module> import *`
__all__ = [
    "BivariateStatistics",
    "nmb",
    "nmae",
    "nmbf",
    "nmaef"
]
 20
 21def nmb( x0, x1 ):
 22    '''Compute Normalized Mean Bias (NMB)
 23
 24    NMB = ( mean(x1) - mean(x0) ) / mean(x0)
 25
 26    Parameters
 27    ----------
 28    x0 : array_like
 29        reference values
 30    x1 : array_like
 31        experiment values
 32    '''
 33
 34    assert (len(x0) == len(x1)), \
 35        "Parameters x0 and x1 must have the same length"
 36
 37    # Mean values
 38    x0_mean = np.mean(x0)
 39    x1_mean = np.mean(x1)
 40
 41    # Metric value
 42    return x1_mean / x0_mean - 1
 43
 44def nmae( x0, x1 ):
 45    '''Compute Normalized Mean Absolute Error (NMAE)
 46
 47    NMAE = mean(abs(x1 - x0)) / abs(mean(x0))
 48
 49    Parameters
 50    ---------
 51    x0 : array_like
 52        reference values
 53    x1 : array_like
 54        experiment values
 55    '''
 56
 57     # Mean values
 58    x0_mean = np.mean(x0)
 59
 60    # Mean absolute difference
 61    abs_diff = np.mean( np.abs(x1 - x0) )
 62
 63    # Metric value
 64    return abs_diff / np.abs( x0_mean )
 65
 66
 67def nmbf( x0, x1 ):
 68    '''Compute Normalized Mean Bias Factor (NMBF)
 69
 70    Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125
 71
 72    Parameters
 73    ----------
 74    x0 : array_like
 75        reference values
 76    x1 : array_like
 77        experiment values
 78    '''
 79
 80    # Ensure that arguments have the same length
 81    assert (len(x0) == len(x1)), \
 82        "Parameters x0 and x1 must have the same length"
 83
 84    # Mean values
 85    x0_mean = np.mean(x0)
 86    x1_mean = np.mean(x1)
 87
 88    # Metric value
 89    if x1_mean >= x0_mean:
 90        result = x1_mean / x0_mean - 1
 91    else:
 92        result= 1 - x0_mean / x1_mean
 93    # Equivalent (faster?) implementation
 94    #S = (mMean - oMean) / np.abs(mMean - oMean)
 95    #result = S * ( np.exp( np.abs( mMean / oMean )) - 1 )
 96
 97    return result
 98
 99def nmaef( x0, x1 ):
100    '''Compute Normalized Mean Absolute Error Factor (NMAEF)
101
102    Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125
103    
104    Parameters
105    ----------
106    x0 : array_like
107        reference values
108    x1 : array_like
109        experiment values
110    '''
111
112    # Ensure that arguments have the same length
113    assert (len(x0) == len(x1)), \
114        "Parameters x0 and x1 must have the same length"
115
116    # Mean values
117    x0_mean = np.mean(x0)
118    x1_mean = np.mean(x1)
119
120    # Mean absolute difference
121    abs_diff = np.mean( np.abs(x1 - x0))
122
123    # Metric value
124    if x1_mean >= x0_mean:
125        result = abs_diff / x0_mean 
126    else:
127        result = abs_diff / x1_mean
128    # Equivalent (faster?) implementation
129    #S = (exp_mean - ref_mean) / np.abs(exp_mean - ref_mean)
130    #result = abs_diff / ( oMean**((1+S)/2) * mMean**((1-S)/2) )
131
132    return result
133
134def _texify_name(name):
135    '''Return a LaTex formatted string for some variables
136    
137    Parameter
138    ---------
139    name : str
140    
141    Returns
142    -------
143    pretty_name : str
144    '''
145    if name=='R2':
146        pretty_name = f'$R^2$'
147    elif name=='r2':
148        pretty_name = f'$r^2$'
149    elif name.lower()=='y_ols':
150        pretty_name = r'$y_{\rm OLS}$'
151    elif name.lower()=='y_sma':
152        pretty_name = r'$y_{\rm SMA}$'
153    elif name.lower()=='y_sen':
154        pretty_name = r'$y_{\rm Sen}$'
155    else:
156        pretty_name = name
157    return pretty_name
158
class BivariateStatistics:
    '''A suite of common statistics to quantify bivariate relationships

    Class method 'summary' provides a formatted summary of these statistics

    Attributes
    ----------
    xmean, ymean : float
        mean of x and y variables
    xmedian, ymedian : float
        median of x and y variables
    xstd, ystd : float
        standard deviation of x and y variables
    mean_difference, md : float
        ymean - xmean
    mean_absolute_difference, mad : float
        mean( |y-x| )
    relative_mean_difference, rmd : float
        md / xmean
    relative_mean_absolute_difference, rmad : float
        mad / xmean
    standardized_mean_difference, smd : float
        md / xstd
    standardized_mean_absolute_difference, smad : float
        mad / xstd
    mean_relative_difference, mrd : float
        mean(y/x) - 1
    mean_log10_ratio, mlr : float
        mean( log10(y/x) )
    mean_absolute_log10_ratio, malr : float
        mean( abs( log10(y/x) ) )
    median_difference, medd : float
        median(y-x)
    median_absolute_difference, medad : float
        median(|y-x|)
    relative_median_difference, rmedd : float
        median(y-x) / xmedian
    relative_median_absolute_difference, rmedad : float
        median(|y-x|) / xmedian
    median_relative_difference, medianrd, medrd : float
        median(y/x)-1
    median_log10_ratio, medlr : float
        median( log10(y/x) )
    median_absolute_log10_ratio, medalr : float
        median( abs( log10(y/x) ) )
    normalized_mean_bias_factor, nmbf : float
        see `nmbf`
    normalized_mean_absolute_error_factor, nmaef : float
        see `nmaef`
    root_mean_square_difference, rmsd : float
        $\\sqrt{ \\langle (y - x)^2 \\rangle }$
    covariance : float
        cov(x,y)
    correlation_pearson, correlation, pearsonr, R, r : float
        Pearson linear correlation coefficient
    correlation_spearman, spearmanr : float
        Spearman, non-parametric rank correlation coefficient
    R2, r2 : float
        Linear coefficient of determination, $R^2$
    '''

    def __init__(self,x,y,w=None,dropna=False,data=None):
        '''Compute suite of bivariate statistics during initialization

        Statistic values are saved in attributes.
        CAUTION: Weights w are ignored except in SMA fit

        Parameters
        ----------
        x : ndarray or str
            independent variable values
        y : ndarray or str
            dependent variable values, same size as x
        w : ndarray or str, optional
            weights for points (x,y), same size as x and y
        dropna : bool, optional (default=False)
            drops NaN values from x, y, and w
        data : dict-like, optional
            if x, y, or w are str, then they should be keys in data

        Raises
        ------
        ValueError
            if x, y, or w is a str and data is not provided, or
            if x, y, and w do not all have the same length
        '''

        # Get values from data if needed
        if data is None and (isinstance(x,str) or isinstance(y,str) or isinstance(w,str)):
            raise ValueError( 'Data argument must be used if x, y, or w is a string')
        if isinstance(x,str):
            x = data[x]
        if isinstance(y,str):
            y = data[y]
        if isinstance(w,str):
            w = data[w]

        #Ensure that x and y have same length
        if len(x) != len(y):
            raise ValueError( 'Arguments x and y must have the same length' )
        if w is None:
            w = np.ones_like(x)
        if len(w) != len(x):
            raise ValueError( 'Argument w (if present) must have the same length as x' )

        # Drop NaN values
        # A NaN in any of x, y, w makes the product NaN, so one test covers all three
        if dropna:
            isna = np.isnan(x*y*w)
            x = x[~isna]
            y = y[~isna]
            w = w[~isna]

        diff = y - x
        absdiff = np.abs( y - x )
        # Ignore divide by zero and 0/0 while dividing
        # The context manager restores the previous error settings
        # even if the division or logarithm raises
        with np.errstate(divide='ignore',invalid='ignore'):
            ratio = y/x
            log10ratio = np.log10(ratio)

        # Means, medians, and standard deviations
        self.xmean = np.mean(x)
        self.ymean = np.mean(y)
        self.xmedian = np.median(x)
        self.ymedian = np.median(y)
        self.xstd   = np.std(x)
        self.ystd   = np.std(y)

        # Save values for use later
        self._x = x
        self._y = y
        self._w = w

        # Mean and mean absolute differences
        self.mean_difference            = self.md   = self.ymean - self.xmean
        self.mean_absolute_difference   = self.mad  = np.mean( absdiff )

        # Relative and standardized differences
        self.relative_mean_difference           = self.rmd  = self.mean_difference / self.xmean
        self.relative_mean_absolute_difference  = self.rmad = self.mean_absolute_difference / self.xmean
        self.standardized_mean_difference       = self.smd  = self.mean_difference / self.xstd
        self.standardized_mean_absolute_difference  = self.smad = self.mean_absolute_difference / self.xstd

        # Mean and median relative differences
        self.mean_relative_difference   = self.mrd  = np.mean( ratio - 1 )
        self.median_relative_difference = self.medianrd = self.medrd = np.median( ratio - 1 )

        # Median and median absolute differences
        self.median_difference          = self.medd  = np.median( diff )
        self.median_absolute_difference = self.medad = np.median( absdiff )

        # Relative median differences
        self.relative_median_difference          = self.rmedd  = self.median_difference / self.xmedian
        self.relative_median_absolute_difference = self.rmedad = self.median_absolute_difference / self.xmedian

        # Normalized mean bias factor and mean absolute error factor
        self.normalized_mean_bias_factor            = self.nmbf  = nmbf(x,y)
        self.normalized_mean_absolute_error_factor  = self.nmaef = nmaef(x,y)

        # Mean and mean absolute log ratio
        self.mean_log10_ratio          = self.mlr  = np.mean( log10ratio )
        self.mean_absolute_log10_ratio = self.malr = np.mean( np.abs( log10ratio ) )

        # Median and median absolute log ratio
        self.median_log10_ratio          = self.medlr  = np.median( log10ratio )
        self.median_absolute_log10_ratio = self.medalr = np.median( np.abs( log10ratio ) )

        # RMS difference
        self.root_mean_square_difference    = self.rmsd     = np.sqrt( np.mean( np.power( diff, 2) ) )

        # Covariance, correlation
        self.covariance = np.cov(x,y)[0][1]
        self.correlation = self.correlation_pearson = self.R = self.r = self.pearsonr = \
            np.corrcoef(x,y)[0][1]
        self.correlation_spearman = self.spearmanr = stats.spearmanr(x,y).statistic
        self.R2 = self.r2 = self.R**2

    def __getitem__(self,key):
        '''Accesses attribute values via object['key']'''
        return getattr(self,key)

    def fitline(self,method='sma',intercept=True,**kwargs):
        '''Compute bivariate line fit

        Parameters
        ----------
        method : str
            line fitting method: sma (default), ols, wls, York, sen, siegel
            (case insensitive; wls and York are not implemented yet)
        intercept : bool
            defines whether non-zero intercept should be fitted
            The sen and siegel methods always fit an intercept.
        **kwargs
            passed to `acgc.stats.sma` (e.g. robust=True)
            or to `sen` when that method is selected

        Returns
        -------
        result : dict
            dictionary with keys:
            - slope (float)
                slope of fitted line
            - intercept (float)
                intercept of fitted line
            - fittedvalues (array (N,))
                values on fit line
            - residuals (array (N,))
                residual from fit line
            - method (str)
                the method argument, as given
            - fitintercept (bool)
                whether an intercept was fitted

        Raises
        ------
        NotImplementedError
            for the wls and York methods
        ValueError
            for any other unrecognized method
        '''

        fitintercept = intercept
        method_lc = method.lower()

        if method_lc=='sma':
            fit = sma(  self._x,
                        self._y,
                        self._w,
                        intercept=fitintercept,
                        **kwargs)
            slope = fit['slope']
            intercept= fit['intercept']

        elif method_lc=='ols':
            if fitintercept:
                design = np.vstack([self._x,np.ones(len(self._x))]).T
                coef = np.linalg.lstsq( design, self._y, rcond=None )[0]
                slope = coef[0]
                intercept = coef[1]
            else:
                # Only one coefficient is solved for when the line is
                # forced through the origin, so the intercept is zero
                design = np.vstack([self._x]).T
                coef = np.linalg.lstsq( design, self._y, rcond=None )[0]
                slope = coef[0]
                intercept = 0.0

        elif method_lc in ['theil','sen','theilsen']:
            fitintercept = True
            fit = sen( self._x,
                       self._y,
                       **kwargs)
            slope = fit.slope
            intercept = fit.intercept

        elif method_lc=='siegel':
            fitintercept = True
            siegel = stats.siegelslopes( self._x,
                                         self._y )
            slope = siegel.slope
            intercept = siegel.intercept

        elif method_lc=='wls':
            raise NotImplementedError('WLS regression not implemented yet')

        elif method_lc=='york':
            raise NotImplementedError('York regression not implemented yet')

        else:
            raise ValueError('Undefined method '+method)

        fittedvalues = slope * self._x + intercept

        line = dict( slope          = slope,
                     intercept      = intercept,
                     fittedvalues   = fittedvalues,
                     residuals      = self._y - fittedvalues,
                     method         = method,
                     fitintercept   = fitintercept )

        return line

    def slope(self,method='sma',intercept=True,**kwargs):
        '''Compute slope of bivariate line fit

        Parameters
        ----------
        method : str
            line fitting method, see `fitline` (default is sma)
        intercept : bool
            defines whether non-zero intercept should be fitted
        **kwargs
            passed to `fitline`

        Returns
        -------
        slope : float
            slope of fitted line
        '''
        return self.fitline(method,intercept,**kwargs)['slope']

    def intercept(self,method='sma',intercept=True,**kwargs):
        '''Compute intercept of bivariate line fit

        Parameters
        ----------
        method : str
            line fitting method, see `fitline` (default is sma)
        intercept : bool
            defines whether non-zero intercept should be fitted
        **kwargs
            passed to `fitline`

        Returns
        -------
        intercept : float
            value of y intercept
        '''
        return self.fitline(method,intercept,**kwargs)['intercept']

    def _expand_variables(self,variables):
        '''Expand special strings into a list of variables

        Parameter
        ---------
        variables : list or str, default='common'
            Special strings ("all","common") will be expanded to a list of variables
            list arguments will not be modified
            None is treated as "common"

        Returns
        -------
        list
            variable names

        Raises
        ------
        ValueError
            if variables is not a list, None, "all", or "common"
        '''
        if variables is None:
            variables='common'
        if variables=='all':
            variables=['MD','MAD','RMD','RMAD','MRD','SMD','SMAD',
                       'MedD','MedAD','RMedD','RMedAD','MedRD',
                       'NMBF','NMAEF','RMSD',
                       'R','R2','spearmanr','slope','intercept',
                       'fitline']
        elif variables=='common':
            variables=['MD','MAD','RMD','RMAD','MRD','R2','slope']
        if not isinstance(variables,list):
            raise ValueError(
                'variables must be a list, None, or one of these strings: "all","common"')

        return variables

    def summary_dict(self, variables=None,
                     fitline_kw=None,
                     floatformat_fiteqn='{:.3f}' ):
        '''Summarize bivariate statistics into a dict

        Parameters
        ----------
        variables : list or str, default='common'
            names of attribute variables to include in summary
            names are case insensitive
            The following strings are also accepted in place of a list
                "all" (displays all variables)
                "common" (MD, MAD, RMD, RMAD, MRD, R2, slope)
        fitline_kw : dict, default=None
            keywords passed to self.fitline()
            default (None) uses method='sma' and intercept=True
        floatformat_fiteqn : str, default='{:.3f}'
            format specifier passed to `bivariate_line_equation`
            when 'fitline' is one of the variables

        Returns
        -------
        summary : dict
            names and values of variables
        '''

        # List of variables
        variables = self._expand_variables(variables)

        if fitline_kw is None:
            fitline_kw = {'method':'sma',
                          'intercept':True}

        # Construct the dict
        summary = {}
        for v in variables:
            if v in ['slope','intercept']:
                # These variables are object methods
                func = getattr(self,v)
                value = func(**fitline_kw)
            elif v == 'fitline':
                # The key and value both come from the line equation helper
                line = self.fitline(**fitline_kw)
                v,value = bivariate_line_equation(line,floatformat_fiteqn,ystring='separate')
            else:
                # Retrieve values; attribute aliases are lowercase
                value = getattr(self,v.lower())

            summary[v] = value

        return summary

    def summary(self, variables=None, fitline_kw=None,
                floatformat='{:.4f}', floatformat_fiteqn=None,
                stringlength=None ):
        '''Summarize bivariate statistics

        Parameters
        ----------
        variables : list or str, default='common'
            names of attribute variables to include in summary
            names are case insensitive
            The following strings are also accepted in place of a list
                "all" (displays all variables)
                "common" (MD, MAD, RMD, RMAD, MRD, R2, slope)
        fitline_kw : dict, default=None
            keywords passed to `fitline`
        floatformat : str, default='{:.4f}'
            format specifier for floating point values
        floatformat_fiteqn : str, default=floatformat
            format specifier for slope and intercept (a,b) in y = a x + b
        stringlength : int, default=None
            length of the variables on output
            default (None) is to use the length of the longest variable name

        Returns
        -------
        summary : str
            names and values of variables, one "name = value" pair per line
        '''
        # List of variables
        variables = self._expand_variables(variables)

        if floatformat_fiteqn is None:
            floatformat_fiteqn = floatformat
        if stringlength is None:
            # default=0 avoids an error when the variable list is empty
            stringlength = max( (len(v) for v in variables), default=0 )
        stringformat = '{:'+str(stringlength)+'s}'

        # Get a dict containing the needed variables
        summarydict = self.summary_dict( variables, fitline_kw, floatformat_fiteqn )

        # String values (e.g. a fit equation) are printed as-is;
        # everything else is formatted as a floating point number
        lines = []
        for k,v in summarydict.items():
            if isinstance(v,str):
                lines.append( (stringformat+' = {:s}\n').format(k,v) )
            else:
                lines.append( (stringformat+' = '+floatformat+'\n').format(k,v) )

        return ''.join(lines)

    def summary_fig_inset(self, ax, variables=None, fitline_kw=None,
                          floatformat='{:.3f}', floatformat_fiteqn=None,
                          loc=None, loc_units='axes',
                          **kwargs):
        '''Display bivariate statistics as a table inset on a plot axis

        Parameters
        ----------
        ax : matplotlib.Figure.Axis
            axis where the table will be displayed
        variables : list or str, default='common'
            names of attribute variables to include in summary
            names are case insensitive
            The following strings are also accepted in place of a list
                "all" (displays all variables)
                "common" (MD, MAD, RMD, RMAD, MRD, R2, slope)
        fitline_kw : dict, default=None
            keywords passed to `fitline`
        floatformat : str, default='{:.3f}'
            format specifier for floating point values
        floatformat_fiteqn : str, default=floatformat
            format specifier for slope and intercept (a,b) in y = a x + b
        loc : tuple (x0,y0), default=(0.8, 0.05)
            location on the axis where the table will be drawn
            can be in data units or axes units [0-1]
        loc_units : {'axes' (default), 'data'}
            specifies whether loc has 'data' units or 'axes' units [0-1]
        **kwargs
            passed to both `ax.text` calls

        Returns
        -------
        [text1, text2] : list of matplotlib text objects
            Artists for the two text boxes

        Raises
        ------
        ValueError
            if loc_units is not 'data', 'axes', or 'axis'
        '''
        # List of variables
        variables = self._expand_variables(variables)

        if floatformat_fiteqn is None:
            floatformat_fiteqn = floatformat

        # Default location in lower right corner
        if loc is None:
            loc = (0.8,0.05)

        # Coordinates for loc
        if loc_units.lower()=='data':
            coord=ax.transData
        elif loc_units.lower() in ['axes','axis']:
            coord=ax.transAxes
        else:
            raise ValueError('Display units should be "Data" or "Axes"')

        # Get a dict containing the needed variables
        summarydict = self.summary_dict( variables, fitline_kw, floatformat_fiteqn )

        # Column of label text
        label_text = '\n'.join([_texify_name(key) for key in summarydict])
        # Column of value text
        value_text = '\n'.join([value if isinstance(value,str) else floatformat.format(value)
                                for value in summarydict.values()])

        # Check if horizontal alignment keyword is used
        # 'horizontalalignment' takes precedence over 'ha' when both are given
        ha = kwargs.get('horizontalalignment', kwargs.get('ha',''))

        # For right alignment, align on values first
        # Otherwise, align on labels
        if ha=='right':
            first_text = value_text
            second_text = label_text
            sign = -1
        else:
            first_text = label_text
            second_text = value_text
            sign = +1

        # Add first column of text
        t1=ax.text(loc[0],loc[1],
                first_text,
                transform=coord,
                **kwargs
                )

        # Get width of first text column, converted to the units of loc
        bbox = t1.get_window_extent().transformed(coord.inverted())
        width = bbox.x1-bbox.x0

        # Add second column of text, offset by the width of the first
        t2 = ax.text(loc[0]+width*sign,loc[1],
                     second_text,
                     transform=coord,
                     **kwargs
                     )

        return [t1,t2]
class BivariateStatistics:
160class BivariateStatistics:
161    '''A suite of common statistics to quantify bivariate relationships
162
163    Class method 'summary' provides a formatted summary of these statistics
164    
165    Attributes
166    ----------
167    xmean, ymean : float
168        mean of x and y variables
169    xmedian, ymedian :float
170        median of x and y variables
171    xstd, ystd : float
172        standard deviation of x and y variables
173    mean_difference, md : float
174        ymean - xmean
175    mean_absolute_difference, mad : float
176        mean( |y-x| )
177    relative_mean_difference, rmd : float
178        md / xmean
179    relative_mean_absolute_difference, rmad :float
180        mad / xmean
181    standardized_mean_difference, smd : float
182        md / xstd
183    standardized_mean_absolute_difference, smad : float
184        mad /xstd
185    mean_relative_difference, mrd : float
186        mean(y/x) - 1
187    mean_log10_ratio, mlr : float
188        mean( log10(y/x) )
189    mean_absolute_log10_ratio, malr : float
190        mean( abs( log10(y/x) ) )
191    median_difference, medd : float
192        median(y-x)
193    median_absolute_difference, medad : float
194        median(|y-x|)
195    relative_median_difference, rmedd : float
196        median(y-x) / xmedian
197    relative_median_absolute_difference, rmedad : float
198        median(|y-x|) / xmedian
199    median_relative_difference, medianrd, medrd : float
200        median(y/x)-1
201    median_log10_ratio, medlr : float
202        median( log10(y/x) )
203    median_absolute_log10_ratio, medalr : float
204        median( abs( log10(y/x) ) )
205    normalized_mean_bias_factor, nmbf : float
206        see `nmbf` 
207    normalized_mean_absolute_error_factor, nmaef : float
208        see `nmaef`
209    root_mean_square_difference, rmsd : float
210        $\\sqrt{ \\langle (y - x)^2 \\rangle }$
211    covariance : float
212        cov(x,y)
213    correlation_pearson, correlation, pearsonr, R, r : float
214        Pearson linear correlation coefficient 
215    correlation_spearman, spearmanr : float
216        Spearman, non-parametric rank correlation coefficient
217    R2, r2 : float
218        Linear coefficient of determination, $R^2$
219    '''
220
221    def __init__(self,x,y,w=None,dropna=False,data=None):
222        '''Compute suite of bivariate statistics during initialization
223        
224        Statistic values are saved in attributes.
225        CAUTION: Weights w are ignored except in SMA fit
226
227        Parameters
228        ----------
229        x : ndarray or str
230            independent variable values
231        y : ndarray or str
232            dependent variable values, same size as x
233        w : ndarray or str, optional
234            weights for points (x,y), same size as x and y
235        dropna : bool, optional (default=False)
236            drops NaN values from x, y, and w
237        data : dict-like, optional
238            if x, y, or w are str, then they should be keys in data
239        '''
240
241        # Get values from data if needed
242        if data is None and (isinstance(x,str) or isinstance(y,str) or isinstance(w,str)):
243            raise ValueError( 'Data argument must be used if x, y, or w is a string')
244        if isinstance(x,str):
245            x = data[x]
246        if isinstance(y,str):
247            y = data[y]
248        if isinstance(w,str):
249            w = data[w]
250
251        #Ensure that x and y have same length
252        if len(x) != len(y):
253            raise ValueError( 'Arguments x and y must have the same length' )
254        if w is None:
255            w = np.ones_like(x)
256        if len(w) != len(x):
257            raise ValueError( 'Argument w (if present) must have the same length as x' )
258
259        # Drop NaN values
260        if dropna:
261            isna = np.isnan(x*y*w)
262            x = x[~isna]
263            y = y[~isna]
264            w = w[~isna]
265
266        diff = y - x
267        absdiff = np.abs( y - x )
268        # Ignore divide by zero and 0/0 while dividing
269        old_settings = np.seterr(divide='ignore',invalid='ignore')
270        ratio = y/x
271        log10ratio = np.log10(ratio)
272        np.seterr(**old_settings)
273
274        # Means, medians, and standard deviations
275        self.xmean = np.mean(x)
276        self.ymean = np.mean(y)
277        self.xmedian = np.median(x)
278        self.ymedian = np.median(y)
279        self.xstd   = np.std(x)
280        self.ystd   = np.std(y)
281
282        # Save values for use later
283        self._x = x
284        self._y = y
285        self._w = w
286
287        # Mean and mean absolute differences
288        self.mean_difference            = self.md   = self.ymean - self.xmean
289        self.mean_absolute_difference   = self.mad  = np.mean( absdiff )
290
291        # Relative and standardized differences
292        self.relative_mean_difference           = self.rmd  = self.mean_difference / self.xmean
293        self.relative_mean_absolute_difference  = self.rmad = self.mean_absolute_difference / self.xmean
294        self.standardized_mean_difference       = self.smd  = self.mean_difference / self.xstd
295        self.standardized_mean_absolute_difference  = self.smad = self.mean_absolute_difference / self.xstd
296
297        # Mean and median relative differences
298        self.mean_relative_difference   = self.mrd  = np.mean( ratio - 1 )
299        self.median_relative_difference = self.medianrd = self.medrd = np.median( ratio - 1 )
300
301        # Median and median absolute differences
302        self.median_difference          = self.medd  = np.median( diff )
303        self.median_absolute_difference = self.medad = np.median( absdiff )
304
305        # Relative median differences
306        self.relative_median_difference          = self.rmedd  = self.median_difference / self.xmedian
307        self.relative_median_absolute_difference = self.rmedad = self.median_absolute_difference / self.xmedian
308
309        self.normalized_mean_bias_factor            = self.nmbf  = nmbf(x,y)
310        self.normalized_mean_absolute_error_factor  = self.nmaef = nmaef(x,y)
311
312        # Mean and mean absolute log ratio
313        self.mean_log10_ratio          = self.mlr  = np.mean( log10ratio )
314        self.mean_absolute_log10_ratio = self.malr = np.mean( np.abs( log10ratio ) )
315        
316        # Median and median absolute log ratio
317        self.median_log10_ratio          = self.medlr  = np.median( log10ratio )
318        self.median_absolute_log10_ratio = self.medalr = np.median( np.abs( log10ratio ) )
319        
320        # RMS difference
321        self.root_mean_square_difference    = self.rmsd     = np.sqrt( np.mean( np.power( diff, 2) ) )
322
323        # Covariance, correlation
324        self.covariance = np.cov(x,y)[0][1]
325        self.correlation = self.correlation_pearson = self.R = self.r = self.pearsonr = \
326            np.corrcoef(x,y)[0][1]
327        self.correlation_spearman = self.spearmanr = stats.spearmanr(x,y).statistic
328        self.R2 = self.r2 = self.R**2
329
330    def __getitem__(self,key):
331        '''Accesses attribute values via object['key']'''
332        return getattr(self,key)
333
334    def fitline(self,method='sma',intercept=True,**kwargs):
335        '''Compute bivariate line fit
336        
337        Parameters
338        ----------
339        method : str
340            line fitting method: sma (default), ols, wls, York, sen, siegel
341        intercept : bool
342            defines whether non-zero intercept should be fitted
343        **kwargs 
344            passed to `acgc.stats.sma` (e.g. robust=True)
345
346        Returns
347        -------
348        result : dict
349            dictionary with keys:
350            - slope (float)
351                slope of fitted line
352            - intercept (float)
353                intercept of fitted line
354            - fittedvalues (array (N,))
355                values on fit line
356            - residuals (array (N,))
357                residual from fit line
358        '''
359
360        fitintercept = intercept
361
362        if method.lower()=='sma':
363            fit = sma(  self._x,
364                        self._y,
365                        self._w,
366                        intercept=fitintercept,
367                        **kwargs)
368            slope = fit['slope']
369            intercept= fit['intercept']
370
371        elif method.lower()=='ols':
372            if fitintercept:
373                ols = np.linalg.lstsq( np.vstack([self._x,np.ones(len(self._x))]).T,
374                                      self._y, rcond=None )
375            else:
376                ols = np.linalg.lstsq( np.vstack([self._x]).T, self._y, rcond=None )
377            slope = ols[0][0]
378            intercept = ols[0][1]
379
380        elif method.lower() in ['theil','sen','theilsen']:
381            fitintercept = True
382            fit = sen( self._x,
383                       self._y,
384                       **kwargs)
385            slope = fit.slope
386            intercept = fit.intercept
387
388        elif method.lower()=='siegel':
389            fitintercept = True
390            siegel = stats.siegelslopes( self._x,
391                                         self._y )
392            slope = siegel.slope
393            intercept = siegel.intercept
394
395        elif method.lower()=='wls':
396            raise NotImplementedError('WLS regression not implemented yet')
397
398        elif method.lower()=='york':
399            raise NotImplementedError('York regression not implemented yet')
400
401        else:
402            raise ValueError('Undefined method '+method)
403
404        line = dict( slope          = slope,
405                     intercept      = intercept,
406                     fittedvalues   = slope * self._x + intercept,
407                     residuals      = self._y - ( slope * self._x + intercept ),
408                     method         = method,
409                     fitintercept   = fitintercept )
410
411        return line
412
413    def slope(self,method='sma',intercept=True,**kwargs):
414        '''Compute slope of bivariate line fit
415        
416        Parameters
417        ----------
418        method : str
419            line fitting method: sma (default), ols, wls
420        intercept : bool
421            defines whether non-zero intercept should be fitted
422        **kwargs 
423            passed to `fitline`
424
425        Returns
426        -------
427        slope : float
428            value of y intercept
429        '''
430        return self.fitline(method,intercept,**kwargs)['slope']
431
432    def intercept(self,method='sma',intercept=True,**kwargs):
433        '''Compute intercept of bivariate line fit
434        
435        Parameters
436        ----------
437        method : str
438            line fitting method: sma (default) or ols
439        intercept : bool
440            defines whether non-zero intercept should be fitted
441        **kwargs 
442            passed to `fitline`
443
444        Returns
445        -------
446        intercept : float
447            value of y intercept
448        '''
449        return self.fitline(method,intercept,**kwargs)['intercept']
450
451    def _expand_variables(self,variables):
452        '''Expand special strings into a list of variables
453        
454        Parameter
455        ---------
456        variables : list or str, default='common'
457            Special strings ("all","common") will be expanded to a list of variables
458            list arguments will not be modified
459
460        Returns
461        -------
462        list 
463            variable names
464        '''
465        if variables is None:
466            variables='common'
467        if variables=='all':
468            variables=['MD','MAD','RMD','RMAD','MRD','SMD','SMAD',
469                       'MedD','MedAD','RMedD','RMedAD','MedRD',
470                       'NMBF','NMAEF','RMSD',
471                       'R','R2','spearmanr','slope','intercept',
472                       'fitline']
473        elif variables=='common':
474            variables=['MD','MAD','RMD','RMAD','MRD','R2','slope']
475        if not isinstance(variables,list):
476            raise ValueError(
477                'variables must be a list, None, or one of these strings: "all","common"')
478
479        return variables
480
481    def summary_dict(self, variables=None,
482                     fitline_kw=None,
483                     floatformat_fiteqn='{:.3f}' ):
484        '''Summarize bivariate statistics into a dict
485
486        Parameters
487        ----------
488        vars : list or str, default='common'
489            names of attribute variables to include in summary
490            names are case insensitive            
491            The following strings are also accepted in place of a list 
492                "all" (displays all variables)
493                "common" (displays all measures of mean difference)
494        fitline_kw : dict, default=None)
495            keywords passed to self.fitline()
496        
497        Returns
498        -------
499        summary : dict
500            names and values of variables
501        '''
502
503        # List of variables
504        variables = self._expand_variables(variables)
505
506        if fitline_kw is None:
507            fitline_kw = {'method':'sma',
508                          'intercept':True}
509
510        # Construct the dict
511        summary = {}
512        for v in variables:
513            if v in ['slope','intercept']:
514                # These variables are object methods
515                func = getattr(self,v)
516                value = func(**fitline_kw)
517            elif v == 'fitline':
518                line = self.fitline(**fitline_kw)
519                v,value = bivariate_line_equation(line,floatformat_fiteqn,ystring='separate')
520            else:
521                # Retrieve values
522                value = getattr(self,v.lower())
523
524            # summary += (stringformat+'='+floatformat+'\n').format(v,value)
525            summary[v] = value
526
527        return summary
528
529    def summary(self, variables=None, fitline_kw=None,
530                floatformat='{:.4f}', floatformat_fiteqn=None,
531                stringlength=None ):
532        '''Summarize bivariate statistics
533
534        Parameters
535        ----------
536        vars : list or str, default='common'
537            names of attribute variables to include in summary
538            names are case insensitive            
539            The following strings are also accepted in place of a list 
540                "all" (displays all variables)
541                "common" (displays all measures of mean difference)
542        floatformat : str, default='{:.4f}'
543            format specifier for floating point values
544        floatformat_fiteqn : str, default=floatformat
545            format specifier for slope and intercept (a,b) in y = a x + b
546        stringlength : int, default=None
547            length of the variables on output
548            default (None) is to use the length of the longest variable name
549        fitline_kw : dict, default=None
550            keywords passed to `fitline`
551        
552        Returns
553        -------
554        summary : str
555            names and values of variables
556        '''
557        # List of variables
558        variables = self._expand_variables(variables)
559
560        if floatformat_fiteqn is None:
561            floatformat_fiteqn = floatformat
562        if stringlength is None:
563            stringlength = np.max([len(v) for v in variables])
564        stringformat = '{:'+str(stringlength)+'s}'
565
566        # Get a dict containing the needed variables
567        summarydict = self.summary_dict( variables, fitline_kw, floatformat_fiteqn )
568
569        # Extract length of the float numbers from floatformat
570        # import re
571        # floatlength = np.floor( float( re.findall("[-+]?(?:\d*\.*\d+)",
572        #       floatformat )[0] ) ).astype(int)
573
574        # summary = (stringformat+'{:>10s}').format('Variable','Value')
575        summarytext = ''
576        for k,v in summarydict.items():
577            if isinstance(v,str):
578                summarytext += (stringformat+' = {:s}\n').format(k,v)
579            else:
580                summarytext += (stringformat+' = '+floatformat+'\n').format(k,v)
581
582        return summarytext
583
584    def summary_fig_inset(self, ax, variables=None, fitline_kw=None,
585                          floatformat='{:.3f}', floatformat_fiteqn=None,
586                          loc=None, loc_units='axes',
587                          **kwargs):
588        '''Display bivariate statistics as a table inset on a plot axis
589
590        Parameters
591        ----------
592        ax : matplotlib.Figure.Axis 
593            axis where the table will be displayed
594        variables : list or str, default='common'
595            names of attribute variables to include in summary
596            names are case insensitive            
597            The following strings are also accepted in place of a list 
598                "all" (displays all variables)
599                "common" (displays all measures of mean difference)
600        fitline_kw : dict, default=None
601            keywords passed to `fitline`
602        floatformat : str, default='{:.3f}'
603            format specifier for floating point values
604        floatformat_fiteqn : str, default=floatformat
605            format specifier for slope and intercept (a,b) in y = a x + b
606        loc : tuple (x0,y0), default=(0.85, 0.05)
607            location on the axis where the table will be drawn
608            can be in data units or axes units [0-1]
609        loc_units : {'axes' (default), 'data'}
610            specifies whether loc has 'data' units or 'axes' units [0-1]
611                    
612        Returns
613        -------
614        text1, text2 : matplotlib text object
615            Artist for the two text boxes        
616        '''
617        # List of variables
618        variables = self._expand_variables(variables)
619
620        if floatformat_fiteqn is None:
621            floatformat_fiteqn = floatformat
622
623        # Default location in lower right corner
624        if loc is None:
625            loc = (0.8,0.05)
626
627        # Coordinates for loc
628        if loc_units.lower()=='data':
629            coord=ax.transData
630        elif loc_units.lower() in ['axes','axis']:
631            coord=ax.transAxes
632        else:
633            raise ValueError('Display units should be "Data" or "Axes"')
634
635        # Get a dict containing the needed variables
636        summarydict = self.summary_dict( variables, fitline_kw, floatformat_fiteqn )
637
638        # Column of label text
639        label_text = '\n'.join([_texify_name(key) for key in summarydict])
640        # Column of value text
641        value_text = '\n'.join([value if isinstance(value,str) else floatformat.format(value)
642                                for value in summarydict.values()])
643
644        # Check if horizontal alignment keyword is used
645        ha=''
646        try:
647            ha = kwargs['ha']
648        except KeyError:
649            pass
650        try:
651            ha = kwargs['horizontalalignment']
652        except KeyError:
653            pass
654
655        # For right alignment, align on values first
656        # Otherwise, align on labels
657        if ha=='right':
658            first_text = value_text
659            second_text = label_text
660            sign = -1
661        else:
662            first_text = label_text
663            second_text = value_text
664            sign = +1
665
666        # Add first column of text
667        t1=ax.text(loc[0],loc[1],
668                first_text,
669                transform=coord,
670                **kwargs
671                )
672
673        # Get width of first text column
674        bbox = t1.get_window_extent().transformed(coord.inverted())
675        width = bbox.x1-bbox.x0
676
677        # Add second column of text
678        t2 = ax.text(loc[0]+width*sign,loc[1],
679                     second_text,
680                     transform=coord,
681                     **kwargs
682                     )
683
684        ##################################
685        # Early version of this function using matplotlib.table.table()
686
687        # if isinstance(loc,(tuple,list)):
688        #     # Create an inset axis to contain the table
689        #     tableaxis = ax.inset_axes(loc)
690        #     table_width=1
691        # else:
692        #     tableaxis = ax
693
694        # # Display the table on the axis
695        # return mtable.table(
696        #     tableaxis,
697        #     cellText=[[floatformat.format(value)] for value in summarydict.values()],
698        #     rowLabels=[texify_name(key) for key in summarydict],
699        #     colWidths=[table_width/2]*2,
700        #     edges=edges,
701        #     loc=loc, bbox=bbox
702        #     )
703
704        return [t1,t2]

A suite of common statistics to quantify bivariate relationships

Class method 'summary' provides a formatted summary of these statistics

Attributes
  • xmean, ymean (float): mean of x and y variables
  • xmedian, ymedian (float): median of x and y variables
  • xstd, ystd (float): standard deviation of x and y variables
  • mean_difference, md (float): ymean - xmean
  • mean_absolute_difference, mad (float): mean( |y-x| )
  • relative_mean_difference, rmd (float): md / xmean
  • relative_mean_absolute_difference, rmad (float): mad / xmean
  • standardized_mean_difference, smd (float): md / xstd
  • standardized_mean_absolute_difference, smad (float): mad /xstd
  • mean_relative_difference, mrd (float): mean(y/x) - 1
  • mean_log10_ratio, mlr (float): mean( log10(y/x) )
  • mean_absolute_log10_ratio, malr (float): mean( abs( log10(y/x) ) )
  • median_difference, medd (float): median(y-x)
  • median_absolute_difference, medad (float): median(|y-x|)
  • relative_median_difference, rmedd (float): median(y-x) / xmedian
  • relative_median_absolute_difference, rmedad (float): median(|y-x|) / xmedian
  • median_relative_difference, medianrd, medrd (float): median(y/x)-1
  • median_log10_ratio, medlr (float): median( log10(y/x) )
  • median_absolute_log10_ratio, medalr (float): median( abs( log10(y/x) ) )
  • normalized_mean_bias_factor, nmbf (float): see nmbf
  • normalized_mean_absolute_error_factor, nmaef (float): see nmaef
  • root_mean_square_difference, rmsd (float): $\sqrt{ \langle (y - x)^2 \rangle }$
  • covariance (float): cov(x,y)
  • correlation_pearson, correlation, pearsonr, R, r (float): Pearson linear correlation coefficient
  • correlation_spearman, spearmanr (float): Spearman, non-parametric rank correlation coefficient
  • R2, r2 (float): Linear coefficient of determination, $R^2$
BivariateStatistics(x, y, w=None, dropna=False, data=None)
221    def __init__(self,x,y,w=None,dropna=False,data=None):
222        '''Compute suite of bivariate statistics during initialization
223        
224        Statistic values are saved in attributes.
225        CAUTION: Weights w are ignored except in SMA fit
226
227        Parameters
228        ----------
229        x : ndarray or str
230            independent variable values
231        y : ndarray or str
232            dependent variable values, same size as x
233        w : ndarray or str, optional
234            weights for points (x,y), same size as x and y
235        dropna : bool, optional (default=False)
236            drops NaN values from x, y, and w
237        data : dict-like, optional
238            if x, y, or w are str, then they should be keys in data
239        '''
240
241        # Get values from data if needed
242        if data is None and (isinstance(x,str) or isinstance(y,str) or isinstance(w,str)):
243            raise ValueError( 'Data argument must be used if x, y, or w is a string')
244        if isinstance(x,str):
245            x = data[x]
246        if isinstance(y,str):
247            y = data[y]
248        if isinstance(w,str):
249            w = data[w]
250
251        #Ensure that x and y have same length
252        if len(x) != len(y):
253            raise ValueError( 'Arguments x and y must have the same length' )
254        if w is None:
255            w = np.ones_like(x)
256        if len(w) != len(x):
257            raise ValueError( 'Argument w (if present) must have the same length as x' )
258
259        # Drop NaN values
260        if dropna:
261            isna = np.isnan(x*y*w)
262            x = x[~isna]
263            y = y[~isna]
264            w = w[~isna]
265
266        diff = y - x
267        absdiff = np.abs( y - x )
268        # Ignore divide by zero and 0/0 while dividing
269        old_settings = np.seterr(divide='ignore',invalid='ignore')
270        ratio = y/x
271        log10ratio = np.log10(ratio)
272        np.seterr(**old_settings)
273
274        # Means, medians, and standard deviations
275        self.xmean = np.mean(x)
276        self.ymean = np.mean(y)
277        self.xmedian = np.median(x)
278        self.ymedian = np.median(y)
279        self.xstd   = np.std(x)
280        self.ystd   = np.std(y)
281
282        # Save values for use later
283        self._x = x
284        self._y = y
285        self._w = w
286
287        # Mean and mean absolute differences
288        self.mean_difference            = self.md   = self.ymean - self.xmean
289        self.mean_absolute_difference   = self.mad  = np.mean( absdiff )
290
291        # Relative and standardized differences
292        self.relative_mean_difference           = self.rmd  = self.mean_difference / self.xmean
293        self.relative_mean_absolute_difference  = self.rmad = self.mean_absolute_difference / self.xmean
294        self.standardized_mean_difference       = self.smd  = self.mean_difference / self.xstd
295        self.standardized_mean_absolute_difference  = self.smad = self.mean_absolute_difference / self.xstd
296
297        # Mean and median relative differences
298        self.mean_relative_difference   = self.mrd  = np.mean( ratio - 1 )
299        self.median_relative_difference = self.medianrd = self.medrd = np.median( ratio - 1 )
300
301        # Median and median absolute differences
302        self.median_difference          = self.medd  = np.median( diff )
303        self.median_absolute_difference = self.medad = np.median( absdiff )
304
305        # Relative median differences
306        self.relative_median_difference          = self.rmedd  = self.median_difference / self.xmedian
307        self.relative_median_absolute_difference = self.rmedad = self.median_absolute_difference / self.xmedian
308
309        self.normalized_mean_bias_factor            = self.nmbf  = nmbf(x,y)
310        self.normalized_mean_absolute_error_factor  = self.nmaef = nmaef(x,y)
311
312        # Mean and mean absolute log ratio
313        self.mean_log10_ratio          = self.mlr  = np.mean( log10ratio )
314        self.mean_absolute_log10_ratio = self.malr = np.mean( np.abs( log10ratio ) )
315        
316        # Median and median absolute log ratio
317        self.median_log10_ratio          = self.medlr  = np.median( log10ratio )
318        self.median_absolute_log10_ratio = self.medalr = np.median( np.abs( log10ratio ) )
319        
320        # RMS difference
321        self.root_mean_square_difference    = self.rmsd     = np.sqrt( np.mean( np.power( diff, 2) ) )
322
323        # Covariance, correlation
324        self.covariance = np.cov(x,y)[0][1]
325        self.correlation = self.correlation_pearson = self.R = self.r = self.pearsonr = \
326            np.corrcoef(x,y)[0][1]
327        self.correlation_spearman = self.spearmanr = stats.spearmanr(x,y).statistic
328        self.R2 = self.r2 = self.R**2

Compute suite of bivariate statistics during initialization

Statistic values are saved in attributes. CAUTION: Weights w are ignored except in SMA fit

Parameters
  • x (ndarray or str): independent variable values
  • y (ndarray or str): dependent variable values, same size as x
  • w (ndarray or str, optional): weights for points (x,y), same size as x and y
  • dropna (bool, optional (default=False)): drops NaN values from x, y, and w
  • data (dict-like, optional): if x, y, or w are str, then they should be keys in data
xmean
ymean
xmedian
ymedian
xstd
ystd
covariance
def fitline(self, method='sma', intercept=True, **kwargs):
334    def fitline(self,method='sma',intercept=True,**kwargs):
335        '''Compute bivariate line fit
336        
337        Parameters
338        ----------
339        method : str
340            line fitting method: sma (default), ols, wls, York, sen, siegel
341        intercept : bool
342            defines whether non-zero intercept should be fitted
343        **kwargs 
344            passed to `acgc.stats.sma` (e.g. robust=True)
345
346        Returns
347        -------
348        result : dict
349            dictionary with keys:
350            - slope (float)
351                slope of fitted line
352            - intercept (float)
353                intercept of fitted line
354            - fittedvalues (array (N,))
355                values on fit line
356            - residuals (array (N,))
357                residual from fit line
358        '''
359
360        fitintercept = intercept
361
362        if method.lower()=='sma':
363            fit = sma(  self._x,
364                        self._y,
365                        self._w,
366                        intercept=fitintercept,
367                        **kwargs)
368            slope = fit['slope']
369            intercept= fit['intercept']
370
371        elif method.lower()=='ols':
372            if fitintercept:
373                ols = np.linalg.lstsq( np.vstack([self._x,np.ones(len(self._x))]).T,
374                                      self._y, rcond=None )
375            else:
376                ols = np.linalg.lstsq( np.vstack([self._x]).T, self._y, rcond=None )
377            slope = ols[0][0]
378            intercept = ols[0][1]
379
380        elif method.lower() in ['theil','sen','theilsen']:
381            fitintercept = True
382            fit = sen( self._x,
383                       self._y,
384                       **kwargs)
385            slope = fit.slope
386            intercept = fit.intercept
387
388        elif method.lower()=='siegel':
389            fitintercept = True
390            siegel = stats.siegelslopes( self._x,
391                                         self._y )
392            slope = siegel.slope
393            intercept = siegel.intercept
394
395        elif method.lower()=='wls':
396            raise NotImplementedError('WLS regression not implemented yet')
397
398        elif method.lower()=='york':
399            raise NotImplementedError('York regression not implemented yet')
400
401        else:
402            raise ValueError('Undefined method '+method)
403
404        line = dict( slope          = slope,
405                     intercept      = intercept,
406                     fittedvalues   = slope * self._x + intercept,
407                     residuals      = self._y - ( slope * self._x + intercept ),
408                     method         = method,
409                     fitintercept   = fitintercept )
410
411        return line

Compute bivariate line fit

Parameters
  • method (str): line fitting method: sma (default), ols, wls, York, sen, siegel
  • intercept (bool): defines whether non-zero intercept should be fitted
  • **kwargs: passed to acgc.stats.sma (e.g. robust=True)
Returns
  • result (dict): dictionary with keys:
    • slope (float) slope of fitted line
    • intercept (float) intercept of fitted line
    • fittedvalues (array (N,)) values on fit line
    • residuals (array (N,)) residual from fit line
def slope(self, method='sma', intercept=True, **kwargs):
413    def slope(self,method='sma',intercept=True,**kwargs):
414        '''Compute slope of bivariate line fit
415        
416        Parameters
417        ----------
418        method : str
419            line fitting method: sma (default), ols, wls
420        intercept : bool
421            defines whether non-zero intercept should be fitted
422        **kwargs 
423            passed to `fitline`
424
425        Returns
426        -------
427        slope : float
428            value of y intercept
429        '''
430        return self.fitline(method,intercept,**kwargs)['slope']

Compute slope of bivariate line fit

Parameters
  • method (str): line fitting method: sma (default), ols, wls
  • intercept (bool): defines whether non-zero intercept should be fitted
  • **kwargs: passed to fitline
Returns
  • slope (float): slope of the fitted line
def intercept(self, method='sma', intercept=True, **kwargs):
432    def intercept(self,method='sma',intercept=True,**kwargs):
433        '''Compute intercept of bivariate line fit
434        
435        Parameters
436        ----------
437        method : str
438            line fitting method: sma (default) or ols
439        intercept : bool
440            defines whether non-zero intercept should be fitted
441        **kwargs 
442            passed to `fitline`
443
444        Returns
445        -------
446        intercept : float
447            value of y intercept
448        '''
449        return self.fitline(method,intercept,**kwargs)['intercept']

Compute intercept of bivariate line fit

Parameters
  • method (str): line fitting method: sma (default) or ols
  • intercept (bool): defines whether non-zero intercept should be fitted
  • **kwargs: passed to fitline
Returns
  • intercept (float): value of y intercept
def summary_dict(self, variables=None, fitline_kw=None, floatformat_fiteqn='{:.3f}'):
481    def summary_dict(self, variables=None,
482                     fitline_kw=None,
483                     floatformat_fiteqn='{:.3f}' ):
484        '''Summarize bivariate statistics into a dict
485
486        Parameters
487        ----------
488        vars : list or str, default='common'
489            names of attribute variables to include in summary
490            names are case insensitive            
491            The following strings are also accepted in place of a list 
492                "all" (displays all variables)
493                "common" (displays all measures of mean difference)
494        fitline_kw : dict, default=None)
495            keywords passed to self.fitline()
496        
497        Returns
498        -------
499        summary : dict
500            names and values of variables
501        '''
502
503        # List of variables
504        variables = self._expand_variables(variables)
505
506        if fitline_kw is None:
507            fitline_kw = {'method':'sma',
508                          'intercept':True}
509
510        # Construct the dict
511        summary = {}
512        for v in variables:
513            if v in ['slope','intercept']:
514                # These variables are object methods
515                func = getattr(self,v)
516                value = func(**fitline_kw)
517            elif v == 'fitline':
518                line = self.fitline(**fitline_kw)
519                v,value = bivariate_line_equation(line,floatformat_fiteqn,ystring='separate')
520            else:
521                # Retrieve values
522                value = getattr(self,v.lower())
523
524            # summary += (stringformat+'='+floatformat+'\n').format(v,value)
525            summary[v] = value
526
527        return summary

Summarize bivariate statistics into a dict

Parameters
  • variables (list or str, default='common'): names of attribute variables to include in summary; names are case insensitive.
    The following strings are also accepted in place of a list: "all" (displays all variables), "common" (displays all measures of mean difference)
  • fitline_kw (dict, default=None): keywords passed to self.fitline()
Returns
  • summary (dict): names and values of variables
def summary( self, variables=None, fitline_kw=None, floatformat='{:.4f}', floatformat_fiteqn=None, stringlength=None):
529    def summary(self, variables=None, fitline_kw=None,
530                floatformat='{:.4f}', floatformat_fiteqn=None,
531                stringlength=None ):
532        '''Summarize bivariate statistics
533
534        Parameters
535        ----------
536        vars : list or str, default='common'
537            names of attribute variables to include in summary
538            names are case insensitive            
539            The following strings are also accepted in place of a list 
540                "all" (displays all variables)
541                "common" (displays all measures of mean difference)
542        floatformat : str, default='{:.4f}'
543            format specifier for floating point values
544        floatformat_fiteqn : str, default=floatformat
545            format specifier for slope and intercept (a,b) in y = a x + b
546        stringlength : int, default=None
547            length of the variables on output
548            default (None) is to use the length of the longest variable name
549        fitline_kw : dict, default=None
550            keywords passed to `fitline`
551        
552        Returns
553        -------
554        summary : str
555            names and values of variables
556        '''
557        # List of variables
558        variables = self._expand_variables(variables)
559
560        if floatformat_fiteqn is None:
561            floatformat_fiteqn = floatformat
562        if stringlength is None:
563            stringlength = np.max([len(v) for v in variables])
564        stringformat = '{:'+str(stringlength)+'s}'
565
566        # Get a dict containing the needed variables
567        summarydict = self.summary_dict( variables, fitline_kw, floatformat_fiteqn )
568
569        # Extract length of the float numbers from floatformat
570        # import re
571        # floatlength = np.floor( float( re.findall("[-+]?(?:\d*\.*\d+)",
572        #       floatformat )[0] ) ).astype(int)
573
574        # summary = (stringformat+'{:>10s}').format('Variable','Value')
575        summarytext = ''
576        for k,v in summarydict.items():
577            if isinstance(v,str):
578                summarytext += (stringformat+' = {:s}\n').format(k,v)
579            else:
580                summarytext += (stringformat+' = '+floatformat+'\n').format(k,v)
581
582        return summarytext

Summarize bivariate statistics

Parameters
  • variables (list or str, default='common'): names of attribute variables to include in summary; names are case insensitive.
    The following strings are also accepted in place of a list: "all" (displays all variables), "common" (displays all measures of mean difference)
  • floatformat (str, default='{:.4f}'): format specifier for floating point values
  • floatformat_fiteqn (str, default=floatformat): format specifier for slope and intercept (a,b) in y = a x + b
  • stringlength (int, default=None): length of the variables on output default (None) is to use the length of the longest variable name
  • fitline_kw (dict, default=None): keywords passed to fitline
Returns
  • summary (str): names and values of variables
def summary_fig_inset( self, ax, variables=None, fitline_kw=None, floatformat='{:.3f}', floatformat_fiteqn=None, loc=None, loc_units='axes', **kwargs):
584    def summary_fig_inset(self, ax, variables=None, fitline_kw=None,
585                          floatformat='{:.3f}', floatformat_fiteqn=None,
586                          loc=None, loc_units='axes',
587                          **kwargs):
588        '''Display bivariate statistics as a table inset on a plot axis
589
590        Parameters
591        ----------
592        ax : matplotlib.Figure.Axis 
593            axis where the table will be displayed
594        variables : list or str, default='common'
595            names of attribute variables to include in summary
596            names are case insensitive            
597            The following strings are also accepted in place of a list 
598                "all" (displays all variables)
599                "common" (displays all measures of mean difference)
600        fitline_kw : dict, default=None
601            keywords passed to `fitline`
602        floatformat : str, default='{:.3f}'
603            format specifier for floating point values
604        floatformat_fiteqn : str, default=floatformat
605            format specifier for slope and intercept (a,b) in y = a x + b
606        loc : tuple (x0,y0), default=(0.85, 0.05)
607            location on the axis where the table will be drawn
608            can be in data units or axes units [0-1]
609        loc_units : {'axes' (default), 'data'}
610            specifies whether loc has 'data' units or 'axes' units [0-1]
611                    
612        Returns
613        -------
614        text1, text2 : matplotlib text object
615            Artist for the two text boxes        
616        '''
617        # List of variables
618        variables = self._expand_variables(variables)
619
620        if floatformat_fiteqn is None:
621            floatformat_fiteqn = floatformat
622
623        # Default location in lower right corner
624        if loc is None:
625            loc = (0.8,0.05)
626
627        # Coordinates for loc
628        if loc_units.lower()=='data':
629            coord=ax.transData
630        elif loc_units.lower() in ['axes','axis']:
631            coord=ax.transAxes
632        else:
633            raise ValueError('Display units should be "Data" or "Axes"')
634
635        # Get a dict containing the needed variables
636        summarydict = self.summary_dict( variables, fitline_kw, floatformat_fiteqn )
637
638        # Column of label text
639        label_text = '\n'.join([_texify_name(key) for key in summarydict])
640        # Column of value text
641        value_text = '\n'.join([value if isinstance(value,str) else floatformat.format(value)
642                                for value in summarydict.values()])
643
644        # Check if horizontal alignment keyword is used
645        ha=''
646        try:
647            ha = kwargs['ha']
648        except KeyError:
649            pass
650        try:
651            ha = kwargs['horizontalalignment']
652        except KeyError:
653            pass
654
655        # For right alignment, align on values first
656        # Otherwise, align on labels
657        if ha=='right':
658            first_text = value_text
659            second_text = label_text
660            sign = -1
661        else:
662            first_text = label_text
663            second_text = value_text
664            sign = +1
665
666        # Add first column of text
667        t1=ax.text(loc[0],loc[1],
668                first_text,
669                transform=coord,
670                **kwargs
671                )
672
673        # Get width of first text column
674        bbox = t1.get_window_extent().transformed(coord.inverted())
675        width = bbox.x1-bbox.x0
676
677        # Add second column of text
678        t2 = ax.text(loc[0]+width*sign,loc[1],
679                     second_text,
680                     transform=coord,
681                     **kwargs
682                     )
683
684        ##################################
685        # Early version of this function using matplotlib.table.table()
686
687        # if isinstance(loc,(tuple,list)):
688        #     # Create an inset axis to contain the table
689        #     tableaxis = ax.inset_axes(loc)
690        #     table_width=1
691        # else:
692        #     tableaxis = ax
693
694        # # Display the table on the axis
695        # return mtable.table(
696        #     tableaxis,
697        #     cellText=[[floatformat.format(value)] for value in summarydict.values()],
698        #     rowLabels=[texify_name(key) for key in summarydict],
699        #     colWidths=[table_width/2]*2,
700        #     edges=edges,
701        #     loc=loc, bbox=bbox
702        #     )
703
704        return [t1,t2]

Display bivariate statistics as a table inset on a plot axis

Parameters
  • ax (matplotlib.Figure.Axis): axis where the table will be displayed
  • variables (list or str, default='common'): names of attribute variables to include in summary names are case insensitive
    The following strings are also accepted in place of a list "all" (displays all variables) "common" (displays all measures of mean difference)
  • fitline_kw (dict, default=None): keywords passed to fitline
  • floatformat (str, default='{:.3f}'): format specifier for floating point values
  • floatformat_fiteqn (str, default=floatformat): format specifier for slope and intercept (a,b) in y = a x + b
  • loc (tuple (x0,y0), default=(0.8, 0.05)): location on the axis where the table will be drawn; can be in data units or axes units [0-1]
  • loc_units ({'axes' (default), 'data'}): specifies whether loc has 'data' units or 'axes' units [0-1]
Returns
  • text1, text2 (matplotlib text object): Artist for the two text boxes
def nmb(x0, x1):
def nmb( x0, x1 ):
    '''Compute Normalized Mean Bias (NMB)

    NMB = ( mean(x1) - mean(x0) ) / mean(x0)

    Evaluated in the algebraically equivalent form mean(x1) / mean(x0) - 1.

    Parameters
    ----------
    x0 : array_like
        reference values
    x1 : array_like
        experiment values

    Returns
    -------
    nmb : float
        normalized mean bias of x1 relative to x0
    '''

    # Both populations must be paired, element for element
    assert (len(x0) == len(x1)), \
        "Parameters x0 and x1 must have the same length"

    reference_mean  = np.mean(x0)
    experiment_mean = np.mean(x1)

    ratio = experiment_mean / reference_mean
    return ratio - 1

Compute Normalized Mean Bias (NMB)

NMB = ( mean(x1) - mean(x0) ) / mean(x0)

Parameters
  • x0 (array_like): reference values
  • x1 (array_like): experiment values
def nmae(x0, x1):
def nmae( x0, x1 ):
    '''Compute Normalized Mean Absolute Error (NMAE)

    NMAE = mean(abs(x1 - x0)) / abs(mean(x0))

    Parameters
    ----------
    x0 : array_like
        reference values
    x1 : array_like
        experiment values

    Returns
    -------
    nmae : float
        mean absolute difference normalized by the magnitude of the
        reference mean
    '''

    # Plain Python sequences do not support elementwise subtraction, so
    # convert them; any other input type is left untouched so that its own
    # arithmetic rules still apply
    if isinstance(x0, (list, tuple)):
        x0 = np.asarray(x0)
    if isinstance(x1, (list, tuple)):
        x1 = np.asarray(x1)

    # Mean values
    x0_mean = np.mean(x0)

    # Mean absolute difference
    abs_diff = np.mean( np.abs(x1 - x0) )

    # Metric value
    return abs_diff / np.abs( x0_mean )

Compute Normalized Mean Absolute Error (NMAE)

NMAE = mean(abs(x1 - x0)) / abs(mean(x0))

Parameters
  • x0 (array_like): reference values
  • x1 (array_like): experiment values
def nmbf(x0, x1):
def nmbf( x0, x1 ):
    '''Compute Normalized Mean Bias Factor (NMBF)

    Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125

    As implemented here:
        mean(x1) / mean(x0) - 1   when mean(x1) >= mean(x0)
        1 - mean(x0) / mean(x1)   otherwise

    Parameters
    ----------
    x0 : array_like
        reference values
    x1 : array_like
        experiment values

    Returns
    -------
    nmbf : float
        normalized mean bias factor of x1 relative to x0
    '''

    # Both populations must be paired, element for element
    assert (len(x0) == len(x1)), \
        "Parameters x0 and x1 must have the same length"

    reference_mean  = np.mean(x0)
    experiment_mean = np.mean(x1)

    # The branch chosen depends on which of the two means is larger
    if experiment_mean >= reference_mean:
        return experiment_mean / reference_mean - 1
    return 1 - reference_mean / experiment_mean

Compute Normalized Mean Bias Factor (NMBF)

Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125

Parameters
  • x0 (array_like): reference values
  • x1 (array_like): experiment values
def nmaef(x0, x1):
def nmaef( x0, x1 ):
    '''Compute Normalized Mean Absolute Error Factor (NMAEF)

    Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125

    As implemented here:
        mean(abs(x1 - x0)) / mean(x0)   when mean(x1) >= mean(x0)
        mean(abs(x1 - x0)) / mean(x1)   otherwise

    Parameters
    ----------
    x0 : array_like
        reference values
    x1 : array_like
        experiment values

    Returns
    -------
    nmaef : float
        normalized mean absolute error factor of x1 relative to x0
    '''

    # Ensure that arguments have the same length
    assert (len(x0) == len(x1)), \
        "Parameters x0 and x1 must have the same length"

    # Plain Python sequences do not support elementwise subtraction, so
    # convert them; any other input type is left untouched so that its own
    # arithmetic rules still apply
    if isinstance(x0, (list, tuple)):
        x0 = np.asarray(x0)
    if isinstance(x1, (list, tuple)):
        x1 = np.asarray(x1)

    # Mean values
    x0_mean = np.mean(x0)
    x1_mean = np.mean(x1)

    # Mean absolute difference
    abs_diff = np.mean( np.abs(x1 - x0) )

    # Metric value: normalize by whichever mean is smaller
    if x1_mean >= x0_mean:
        result = abs_diff / x0_mean
    else:
        result = abs_diff / x1_mean

    return result

Compute Normalized Mean Absolute Error Factor (NMAEF)

Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125

Parameters
  • x0 (array_like): reference values
  • x1 (array_like): experiment values