acgc.stats.bivariate

Bivariate statistics

Statistical measures of relationships between two populations

  1#!/usr/bin/env python3
  2# -*- coding: utf-8 -*-
  3""" Bivariate statistics
  4
  5Statistical measures of relationships between two populations
  6"""
  7
  8import numpy as np
  9from scipy import stats
 10from .bivariate_lines import sen, sma, bivariate_line_equation
 11# import xarray as xr
 12
 13__all__ = [
 14    "BivariateStatistics",
 15    "nmb",
 16    "nmae",
 17    "nmbf",
 18    "nmaef"
 19]
 20
 21def nmb( x0, x1 ):
 22    '''Compute Normalized Mean Bias (NMB)
 23
 24    NMB = ( mean(x1) - mean(x0) ) / mean(x0)
 25
 26    Parameters
 27    ----------
 28    x0 : array_like
 29        reference values
 30    x1 : array_like
 31        experiment values
 32    '''
 33
 34    assert (len(x0) == len(x1)), \
 35        "Parameters x0 and x1 must have the same length"
 36
 37    # Mean values
 38    x0_mean = np.mean(x0)
 39    x1_mean = np.mean(x1)
 40
 41    # Metric value
 42    return x1_mean / x0_mean - 1
 43
 44def nmae( x0, x1 ):
 45    '''Compute Normalized Mean Absolute Error (NMAE)
 46
 47    NMAE = mean(abs(x1 - x0)) / abs(mean(x0))
 48
 49    Parameters
 50    ---------
 51    x0 : array_like
 52        reference values
 53    x1 : array_like
 54        experiment values
 55    '''
 56
 57     # Mean values
 58    x0_mean = np.mean(x0)
 59
 60    # Mean absolute difference
 61    abs_diff = np.mean( np.abs(x1 - x0) )
 62
 63    # Metric value
 64    return abs_diff / np.abs( x0_mean )
 65
 66
 67def nmbf( x0, x1 ):
 68    '''Compute Normalized Mean Bias Factor (NMBF)
 69
 70    Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125
 71
 72    Parameters
 73    ----------
 74    x0 : array_like
 75        reference values
 76    x1 : array_like
 77        experiment values
 78    '''
 79
 80    # Ensure that arguments have the same length
 81    assert (len(x0) == len(x1)), \
 82        "Parameters x0 and x1 must have the same length"
 83
 84    # Mean values
 85    x0_mean = np.mean(x0)
 86    x1_mean = np.mean(x1)
 87
 88    # Metric value
 89    if x1_mean >= x0_mean:
 90        result = x1_mean / x0_mean - 1
 91    else:
 92        result= 1 - x0_mean / x1_mean
 93    # Equivalent (faster?) implementation
 94    #S = (mMean - oMean) / np.abs(mMean - oMean)
 95    #result = S * ( np.exp( np.abs( mMean / oMean )) - 1 )
 96
 97    return result
 98
 99def nmaef( x0, x1 ):
100    '''Compute Normalized Mean Absolute Error Factor (NMAEF)
101
102    Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125
103    
104    Parameters
105    ----------
106    x0 : array_like
107        reference values
108    x1 : array_like
109        experiment values
110    '''
111
112    # Ensure that arguments have the same length
113    assert (len(x0) == len(x1)), \
114        "Parameters x0 and x1 must have the same length"
115
116    # Mean values
117    x0_mean = np.mean(x0)
118    x1_mean = np.mean(x1)
119
120    # Mean absolute difference
121    abs_diff = np.mean( np.abs(x1 - x0))
122
123    # Metric value
124    if x1_mean >= x0_mean:
125        result = abs_diff / x0_mean 
126    else:
127        result = abs_diff / x1_mean
128    # Equivalent (faster?) implementation
129    #S = (exp_mean - ref_mean) / np.abs(exp_mean - ref_mean)
130    #result = abs_diff / ( oMean**((1+S)/2) * mMean**((1-S)/2) )
131
132    return result
133
134def _texify_name(name):
135    '''Return a LaTex formatted string for some variables
136    
137    Parameter
138    ---------
139    name : str
140    
141    Returns
142    -------
143    pretty_name : str
144    '''
145    if name=='R2':
146        pretty_name = f'$R^2$'
147    elif name=='r2':
148        pretty_name = f'$r^2$'
149    elif name.lower()=='y_ols':
150        pretty_name = r'$y_{\rm OLS}$'
151    elif name.lower()=='y_sma':
152        pretty_name = r'$y_{\rm SMA}$'
153    elif name.lower()=='y_sen':
154        pretty_name = r'$y_{\rm Sen}$'
155    else:
156        pretty_name = name
157    return pretty_name
158
159class BivariateStatistics:
160    '''A suite of common statistics to quantify bivariate relationships
161
162    Class method 'summary' provides a formatted summary of these statistics
163    
164    Attributes
165    ----------
166    xmean, ymean : float
167        mean of x and y variables
168    xmedian, ymedian :float
169        median of x and y variables
170    xstd, ystd : float
171        standard deviation of x and y variables
172    mean_difference, md : float
173        ymean - xmean
174    mean_absolute_difference, mad : float
175        mean( |y-x| )
176    relative_mean_difference, rmd : float
177        md / xmean
178    relative_mean_absolute_difference, rmad :float
179        mad / xmean
180    standardized_mean_difference, smd : float
181        md / xstd
182    standardized_mean_absolute_difference, smad : float
183        mad /xstd
184    mean_relative_difference, mrd : float
185        mean(y/x) - 1
186    median_difference, medd : float
187        median(y-x)
188    median_absolute_difference, medad : float
189        median(|y-x|)
190    relative_median_difference, rmedd : float
191        median(y-x) / xmedian
192    relative_median_absolute_difference, rmedad : float
193        median(|y-x|) / xmedian
194    median_relative_difference, medianrd, medrd : float
195        median(y/x)-1
196    normalized_mean_bias_factor, nmbf : float
197        see `nmbf` 
198    normalized_mean_absolute_error_factor, nmaef : float
199        see `nmaef`
200    root_mean_square_difference, rmsd : float
201        $\\sqrt{ \\langle (y - x)^2 \\rangle }$
202    covariance : float
203        cov(x,y)
204    correlation_pearson, correlation, pearsonr, R, r : float
205        Pearson linear correlation coefficient 
206    correlation_spearman, spearmanr : float
207        Spearman, non-parametric rank correlation coefficient
208    R2, r2 : float
209        Linear coefficient of determination, $R^2$
210    '''
211
212    def __init__(self,x,y,w=None,dropna=False,data=None):
213        '''Compute suite of bivariate statistics during initialization
214        
215        Statistic values are saved in attributes.
216        CAUTION: Weights w are ignored except in SMA fit
217
218        Parameters
219        ----------
220        x : ndarray or str
221            independent variable values
222        y : ndarray or str
223            dependent variable values, same size as x
224        w : ndarray or str, optional
225            weights for points (x,y), same size as x and y
226        dropna : bool, optional (default=False)
227            drops NaN values from x, y, and w
228        data : dict-like, optional
229            if x, y, or w are str, then they should be keys in data
230        '''
231
232        # Get values from data if needed
233        if data is None and (isinstance(x,str) or isinstance(y,str) or isinstance(w,str)):
234            raise ValueError( 'Data argument must be used if x, y, or w is a string')
235        if isinstance(x,str):
236            x = data[x]
237        if isinstance(y,str):
238            y = data[y]
239        if isinstance(w,str):
240            w = data[w]
241
242        #Ensure that x and y have same length
243        if len(x) != len(y):
244            raise ValueError( 'Arguments x and y must have the same length' )
245        if w is None:
246            w = np.ones_like(x)
247        if len(w) != len(x):
248            raise ValueError( 'Argument w (if present) must have the same length as x' )
249
250        # Drop NaN values
251        if dropna:
252            isna = np.isnan(x*y*w)
253            x = x[~isna]
254            y = y[~isna]
255            w = w[~isna]
256
257        diff = y - x
258        absdiff = np.abs( y - x )
259        # Ignore divide by zero and 0/0 while dividing
260        old_settings = np.seterr(divide='ignore',invalid='ignore')
261        ratio = y/x
262        np.seterr(**old_settings)
263
264        # Means, medians, and standard deviations
265        self.xmean = np.mean(x)
266        self.ymean = np.mean(y)
267        self.xmedian = np.median(x)
268        self.ymedian = np.median(y)
269        self.xstd   = np.std(x)
270        self.ystd   = np.std(y)
271
272        # Save values for use later
273        self._x = x
274        self._y = y
275        self._w = w
276
277        # Mean and mean absolute differences
278        self.mean_difference            = self.md   = self.ymean - self.xmean
279        self.mean_absolute_difference   = self.mad  = np.mean( absdiff )
280
281        # Relative and standardized differences
282        self.relative_mean_difference           = self.rmd  = self.mean_difference / self.xmean
283        self.relative_mean_absolute_difference  = self.rmad = self.mean_absolute_difference / self.xmean
284        self.standardized_mean_difference       = self.smd  = self.mean_difference / self.xstd
285        self.standardized_mean_absolute_difference  = self.smad = self.mean_absolute_difference / self.xstd
286
287        # Mean and median relative differences
288        self.mean_relative_difference   = self.mrd  = np.mean( ratio - 1 )
289        self.median_relative_difference = self.medianrd = self.medrd = np.median( ratio - 1 )
290
291        # Median and median absolute differences
292        self.median_difference          = self.medd  = np.median( diff )
293        self.median_absolute_difference = self.medad = np.median( absdiff )
294
295        # Relative median differences
296        self.relative_median_difference          = self.rmedd  = self.median_difference / self.xmedian
297        self.relative_median_absolute_difference = self.rmedad = self.median_absolute_difference / self.xmedian
298
299        self.normalized_mean_bias_factor            = self.nmbf  = nmbf(x,y)
300        self.normalized_mean_absolute_error_factor  = self.nmaef = nmaef(x,y)
301
302        # RMS difference
303        self.root_mean_square_difference    = self.rmsd     = np.sqrt( np.mean( np.power( diff, 2) ) )
304
305        # Covariance, correlation
306        self.covariance = np.cov(x,y)[0][1]
307        self.correlation = self.correlation_pearson = self.R = self.r = self.pearsonr = \
308            np.corrcoef(x,y)[0][1]
309        self.correlation_spearman = self.spearmanr = stats.spearmanr(x,y).statistic
310        self.R2 = self.r2 = self.R**2
311
312    def __getitem__(self,key):
313        '''Accesses attribute values via object['key']'''
314        return getattr(self,key)
315
316    def fitline(self,method='sma',intercept=True,**kwargs):
317        '''Compute bivariate line fit
318        
319        Parameters
320        ----------
321        method : str
322            line fitting method: sma (default), ols, wls, York, sen, siegel
323        intercept : bool
324            defines whether non-zero intercept should be fitted
325        **kwargs 
326            passed to `acgc.stats.sma` (e.g. robust=True)
327
328        Returns
329        -------
330        result : dict
331            dictionary with keys:
332            - slope (float)
333                slope of fitted line
334            - intercept (float)
335                intercept of fitted line
336            - fittedvalues (array (N,))
337                values on fit line
338            - residuals (array (N,))
339                residual from fit line
340        '''
341
342        fitintercept = intercept
343
344        if method.lower()=='sma':
345            fit = sma(  self._x,
346                        self._y,
347                        self._w,
348                        intercept=fitintercept,
349                        **kwargs)
350            slope = fit['slope']
351            intercept= fit['intercept']
352
353        elif method.lower()=='ols':
354            if fitintercept:
355                ols = np.linalg.lstsq( np.vstack([self._x,np.ones(len(self._x))]).T,
356                                      self._y, rcond=None )
357            else:
358                ols = np.linalg.lstsq( np.vstack([self._x]).T, self._y, rcond=None )
359            slope = ols[0][0]
360            intercept = ols[0][1]
361
362        elif method.lower() in ['theil','sen','theilsen']:
363            fitintercept = True
364            fit = sen( self._x,
365                       self._y,
366                       **kwargs)
367            slope = fit.slope
368            intercept = fit.intercept
369
370        elif method.lower()=='siegel':
371            fitintercept = True
372            siegel = stats.siegelslopes( self._x,
373                                         self._y )
374            slope = siegel.slope
375            intercept = siegel.intercept
376
377        elif method.lower()=='wls':
378            raise NotImplementedError('WLS regression not implemented yet')
379
380        elif method.lower()=='york':
381            raise NotImplementedError('York regression not implemented yet')
382
383        else:
384            raise ValueError('Undefined method '+method)
385
386        line = dict( slope          = slope,
387                     intercept      = intercept,
388                     fittedvalues   = slope * self._x + intercept,
389                     residuals      = self._y - ( slope * self._x + intercept ),
390                     method         = method,
391                     fitintercept   = fitintercept )
392
393        return line
394
395    def slope(self,method='sma',intercept=True,**kwargs):
396        '''Compute slope of bivariate line fit
397        
398        Parameters
399        ----------
400        method : str
401            line fitting method: sma (default), ols, wls
402        intercept : bool
403            defines whether non-zero intercept should be fitted
404        **kwargs 
405            passed to `fitline`
406
407        Returns
408        -------
409        slope : float
410            value of y intercept
411        '''
412        return self.fitline(method,intercept,**kwargs)['slope']
413
414    def intercept(self,method='sma',intercept=True,**kwargs):
415        '''Compute intercept of bivariate line fit
416        
417        Parameters
418        ----------
419        method : str
420            line fitting method: sma (default) or ols
421        intercept : bool
422            defines whether non-zero intercept should be fitted
423        **kwargs 
424            passed to `fitline`
425
426        Returns
427        -------
428        intercept : float
429            value of y intercept
430        '''
431        return self.fitline(method,intercept,**kwargs)['intercept']
432
433    def _expand_variables(self,variables):
434        '''Expand special strings into a list of variables
435        
436        Parameter
437        ---------
438        variables : list or str, default='common'
439            Special strings ("all","common") will be expanded to a list of variables
440            list arguments will not be modified
441
442        Returns
443        -------
444        list 
445            variable names
446        '''
447        if variables is None:
448            variables='common'
449        if variables=='all':
450            variables=['MD','MAD','RMD','RMAD','MRD','SMD','SMAD',
451                       'MedD','MedAD','RMedD','RMedAD','MedRD',
452                       'NMBF','NMAEF','RMSD',
453                       'R','R2','spearmanr','slope','intercept',
454                       'fitline']
455        elif variables=='common':
456            variables=['MD','MAD','RMD','RMAD','MRD','R2','slope']
457        if not isinstance(variables,list):
458            raise ValueError(
459                'variables must be a list, None, or one of these strings: "all","common"')
460
461        return variables
462
463    def summary_dict(self, variables=None,
464                     fitline_kw=None,
465                     floatformat_fiteqn='{:.3f}' ):
466        '''Summarize bivariate statistics into a dict
467
468        Parameters
469        ----------
470        vars : list or str, default='common'
471            names of attribute variables to include in summary
472            names are case insensitive            
473            The following strings are also accepted in place of a list 
474                "all" (displays all variables)
475                "common" (displays all measures of mean difference)
476        fitline_kw : dict, default=None)
477            keywords passed to self.fitline()
478        
479        Returns
480        -------
481        summary : dict
482            names and values of variables
483        '''
484
485        # List of variables
486        variables = self._expand_variables(variables)
487
488        if fitline_kw is None:
489            fitline_kw = {'method':'sma',
490                          'intercept':True}
491
492        # Construct the dict
493        summary = {}
494        for v in variables:
495            if v in ['slope','intercept']:
496                # These variables are object methods
497                func = getattr(self,v)
498                value = func(**fitline_kw)
499            elif v == 'fitline':
500                line = self.fitline(**fitline_kw)
501                v,value = bivariate_line_equation(line,floatformat_fiteqn,ystring='separate')
502            else:
503                # Retrieve values
504                value = getattr(self,v.lower())
505
506            # summary += (stringformat+'='+floatformat+'\n').format(v,value)
507            summary[v] = value
508
509        return summary
510
511    def summary(self, variables=None, fitline_kw=None,
512                floatformat='{:.4f}', floatformat_fiteqn=None,
513                stringlength=None ):
514        '''Summarize bivariate statistics
515
516        Parameters
517        ----------
518        vars : list or str, default='common'
519            names of attribute variables to include in summary
520            names are case insensitive            
521            The following strings are also accepted in place of a list 
522                "all" (displays all variables)
523                "common" (displays all measures of mean difference)
524        floatformat : str, default='{:.4f}'
525            format specifier for floating point values
526        floatformat_fiteqn : str, default=floatformat
527            format specifier for slope and intercept (a,b) in y = a x + b
528        stringlength : int, default=None
529            length of the variables on output
530            default (None) is to use the length of the longest variable name
531        fitline_kw : dict, default=None
532            keywords passed to `fitline`
533        
534        Returns
535        -------
536        summary : str
537            names and values of variables
538        '''
539        # List of variables
540        variables = self._expand_variables(variables)
541
542        if floatformat_fiteqn is None:
543            floatformat_fiteqn = floatformat
544        if stringlength is None:
545            stringlength = np.max([len(v) for v in variables])
546        stringformat = '{:'+str(stringlength)+'s}'
547
548        # Get a dict containing the needed variables
549        summarydict = self.summary_dict( variables, fitline_kw, floatformat_fiteqn )
550
551        # Extract length of the float numbers from floatformat
552        # import re
553        # floatlength = np.floor( float( re.findall("[-+]?(?:\d*\.*\d+)",
554        #       floatformat )[0] ) ).astype(int)
555
556        # summary = (stringformat+'{:>10s}').format('Variable','Value')
557        summarytext = ''
558        for k,v in summarydict.items():
559            if isinstance(v,str):
560                summarytext += (stringformat+' = {:s}\n').format(k,v)
561            else:
562                summarytext += (stringformat+' = '+floatformat+'\n').format(k,v)
563
564        return summarytext
565
566    def summary_fig_inset(self, ax, variables=None, fitline_kw=None,
567                          floatformat='{:.3f}', floatformat_fiteqn=None,
568                          loc=None, loc_units='axes',
569                          **kwargs):
570        '''Display bivariate statistics as a table inset on a plot axis
571
572        Parameters
573        ----------
574        ax : matplotlib.Figure.Axis 
575            axis where the table will be displayed
576        variables : list or str, default='common'
577            names of attribute variables to include in summary
578            names are case insensitive            
579            The following strings are also accepted in place of a list 
580                "all" (displays all variables)
581                "common" (displays all measures of mean difference)
582        fitline_kw : dict, default=None
583            keywords passed to `fitline`
584        floatformat : str, default='{:.3f}'
585            format specifier for floating point values
586        floatformat_fiteqn : str, default=floatformat
587            format specifier for slope and intercept (a,b) in y = a x + b
588        loc : tuple (x0,y0), default=(0.85, 0.05)
589            location on the axis where the table will be drawn
590            can be in data units or axes units [0-1]
591        loc_units : {'axes' (default), 'data'}
592            specifies whether loc has 'data' units or 'axes' units [0-1]
593                    
594        Returns
595        -------
596        text1, text2 : matplotlib text object
597            Artist for the two text boxes        
598        '''
599        # List of variables
600        variables = self._expand_variables(variables)
601
602        if floatformat_fiteqn is None:
603            floatformat_fiteqn = floatformat
604
605        # Default location in lower right corner
606        if loc is None:
607            loc = (0.8,0.05)
608
609        # Coordinates for loc
610        if loc_units.lower()=='data':
611            coord=ax.transData
612        elif loc_units.lower() in ['axes','axis']:
613            coord=ax.transAxes
614        else:
615            raise ValueError('Display units should be "Data" or "Axes"')
616
617        # Get a dict containing the needed variables
618        summarydict = self.summary_dict( variables, fitline_kw, floatformat_fiteqn )
619
620        # Column of label text
621        label_text = '\n'.join([_texify_name(key) for key in summarydict])
622        # Column of value text
623        value_text = '\n'.join([value if isinstance(value,str) else floatformat.format(value)
624                                for value in summarydict.values()])
625
626        # Check if horizontal alignment keyword is used
627        ha=''
628        try:
629            ha = kwargs['ha']
630        except KeyError:
631            pass
632        try:
633            ha = kwargs['horizontalalignment']
634        except KeyError:
635            pass
636
637        # For right alignment, align on values first
638        # Otherwise, align on labels
639        if ha=='right':
640            first_text = value_text
641            second_text = label_text
642            sign = -1
643        else:
644            first_text = label_text
645            second_text = value_text
646            sign = +1
647
648        # Add first column of text
649        t1=ax.text(loc[0],loc[1],
650                first_text,
651                transform=coord,
652                **kwargs
653                )
654
655        # Get width of first text column
656        bbox = t1.get_window_extent().transformed(coord.inverted())
657        width = bbox.x1-bbox.x0
658
659        # Add second column of text
660        t2 = ax.text(loc[0]+width*sign,loc[1],
661                     second_text,
662                     transform=coord,
663                     **kwargs
664                     )
665
666        ##################################
667        # Early version of this function using matplotlib.table.table()
668
669        # if isinstance(loc,(tuple,list)):
670        #     # Create an inset axis to contain the table
671        #     tableaxis = ax.inset_axes(loc)
672        #     table_width=1
673        # else:
674        #     tableaxis = ax
675
676        # # Display the table on the axis
677        # return mtable.table(
678        #     tableaxis,
679        #     cellText=[[floatformat.format(value)] for value in summarydict.values()],
680        #     rowLabels=[texify_name(key) for key in summarydict],
681        #     colWidths=[table_width/2]*2,
682        #     edges=edges,
683        #     loc=loc, bbox=bbox
684        #     )
685
686        return [t1,t2]
class BivariateStatistics:
160class BivariateStatistics:
161    '''A suite of common statistics to quantify bivariate relationships
162
163    Class method 'summary' provides a formatted summary of these statistics
164    
165    Attributes
166    ----------
167    xmean, ymean : float
168        mean of x and y variables
169    xmedian, ymedian :float
170        median of x and y variables
171    xstd, ystd : float
172        standard deviation of x and y variables
173    mean_difference, md : float
174        ymean - xmean
175    mean_absolute_difference, mad : float
176        mean( |y-x| )
177    relative_mean_difference, rmd : float
178        md / xmean
179    relative_mean_absolute_difference, rmad :float
180        mad / xmean
181    standardized_mean_difference, smd : float
182        md / xstd
183    standardized_mean_absolute_difference, smad : float
184        mad /xstd
185    mean_relative_difference, mrd : float
186        mean(y/x) - 1
187    median_difference, medd : float
188        median(y-x)
189    median_absolute_difference, medad : float
190        median(|y-x|)
191    relative_median_difference, rmedd : float
192        median(y-x) / xmedian
193    relative_median_absolute_difference, rmedad : float
194        median(|y-x|) / xmedian
195    median_relative_difference, medianrd, medrd : float
196        median(y/x)-1
197    normalized_mean_bias_factor, nmbf : float
198        see `nmbf` 
199    normalized_mean_absolute_error_factor, nmaef : float
200        see `nmaef`
201    root_mean_square_difference, rmsd : float
202        $\\sqrt{ \\langle (y - x)^2 \\rangle }$
203    covariance : float
204        cov(x,y)
205    correlation_pearson, correlation, pearsonr, R, r : float
206        Pearson linear correlation coefficient 
207    correlation_spearman, spearmanr : float
208        Spearman, non-parametric rank correlation coefficient
209    R2, r2 : float
210        Linear coefficient of determination, $R^2$
211    '''
212
213    def __init__(self,x,y,w=None,dropna=False,data=None):
214        '''Compute suite of bivariate statistics during initialization
215        
216        Statistic values are saved in attributes.
217        CAUTION: Weights w are ignored except in SMA fit
218
219        Parameters
220        ----------
221        x : ndarray or str
222            independent variable values
223        y : ndarray or str
224            dependent variable values, same size as x
225        w : ndarray or str, optional
226            weights for points (x,y), same size as x and y
227        dropna : bool, optional (default=False)
228            drops NaN values from x, y, and w
229        data : dict-like, optional
230            if x, y, or w are str, then they should be keys in data
231        '''
232
233        # Get values from data if needed
234        if data is None and (isinstance(x,str) or isinstance(y,str) or isinstance(w,str)):
235            raise ValueError( 'Data argument must be used if x, y, or w is a string')
236        if isinstance(x,str):
237            x = data[x]
238        if isinstance(y,str):
239            y = data[y]
240        if isinstance(w,str):
241            w = data[w]
242
243        #Ensure that x and y have same length
244        if len(x) != len(y):
245            raise ValueError( 'Arguments x and y must have the same length' )
246        if w is None:
247            w = np.ones_like(x)
248        if len(w) != len(x):
249            raise ValueError( 'Argument w (if present) must have the same length as x' )
250
251        # Drop NaN values
252        if dropna:
253            isna = np.isnan(x*y*w)
254            x = x[~isna]
255            y = y[~isna]
256            w = w[~isna]
257
258        diff = y - x
259        absdiff = np.abs( y - x )
260        # Ignore divide by zero and 0/0 while dividing
261        old_settings = np.seterr(divide='ignore',invalid='ignore')
262        ratio = y/x
263        np.seterr(**old_settings)
264
265        # Means, medians, and standard deviations
266        self.xmean = np.mean(x)
267        self.ymean = np.mean(y)
268        self.xmedian = np.median(x)
269        self.ymedian = np.median(y)
270        self.xstd   = np.std(x)
271        self.ystd   = np.std(y)
272
273        # Save values for use later
274        self._x = x
275        self._y = y
276        self._w = w
277
278        # Mean and mean absolute differences
279        self.mean_difference            = self.md   = self.ymean - self.xmean
280        self.mean_absolute_difference   = self.mad  = np.mean( absdiff )
281
282        # Relative and standardized differences
283        self.relative_mean_difference           = self.rmd  = self.mean_difference / self.xmean
284        self.relative_mean_absolute_difference  = self.rmad = self.mean_absolute_difference / self.xmean
285        self.standardized_mean_difference       = self.smd  = self.mean_difference / self.xstd
286        self.standardized_mean_absolute_difference  = self.smad = self.mean_absolute_difference / self.xstd
287
288        # Mean and median relative differences
289        self.mean_relative_difference   = self.mrd  = np.mean( ratio - 1 )
290        self.median_relative_difference = self.medianrd = self.medrd = np.median( ratio - 1 )
291
292        # Median and median absolute differences
293        self.median_difference          = self.medd  = np.median( diff )
294        self.median_absolute_difference = self.medad = np.median( absdiff )
295
296        # Relative median differences
297        self.relative_median_difference          = self.rmedd  = self.median_difference / self.xmedian
298        self.relative_median_absolute_difference = self.rmedad = self.median_absolute_difference / self.xmedian
299
300        self.normalized_mean_bias_factor            = self.nmbf  = nmbf(x,y)
301        self.normalized_mean_absolute_error_factor  = self.nmaef = nmaef(x,y)
302
303        # RMS difference
304        self.root_mean_square_difference    = self.rmsd     = np.sqrt( np.mean( np.power( diff, 2) ) )
305
306        # Covariance, correlation
307        self.covariance = np.cov(x,y)[0][1]
308        self.correlation = self.correlation_pearson = self.R = self.r = self.pearsonr = \
309            np.corrcoef(x,y)[0][1]
310        self.correlation_spearman = self.spearmanr = stats.spearmanr(x,y).statistic
311        self.R2 = self.r2 = self.R**2
312
313    def __getitem__(self,key):
314        '''Accesses attribute values via object['key']'''
315        return getattr(self,key)
316
317    def fitline(self,method='sma',intercept=True,**kwargs):
318        '''Compute bivariate line fit
319        
320        Parameters
321        ----------
322        method : str
323            line fitting method: sma (default), ols, wls, York, sen, siegel
324        intercept : bool
325            defines whether non-zero intercept should be fitted
326        **kwargs 
327            passed to `acgc.stats.sma` (e.g. robust=True)
328
329        Returns
330        -------
331        result : dict
332            dictionary with keys:
333            - slope (float)
334                slope of fitted line
335            - intercept (float)
336                intercept of fitted line
337            - fittedvalues (array (N,))
338                values on fit line
339            - residuals (array (N,))
340                residual from fit line
341        '''
342
343        fitintercept = intercept
344
345        if method.lower()=='sma':
346            fit = sma(  self._x,
347                        self._y,
348                        self._w,
349                        intercept=fitintercept,
350                        **kwargs)
351            slope = fit['slope']
352            intercept= fit['intercept']
353
354        elif method.lower()=='ols':
355            if fitintercept:
356                ols = np.linalg.lstsq( np.vstack([self._x,np.ones(len(self._x))]).T,
357                                      self._y, rcond=None )
358            else:
359                ols = np.linalg.lstsq( np.vstack([self._x]).T, self._y, rcond=None )
360            slope = ols[0][0]
361            intercept = ols[0][1]
362
363        elif method.lower() in ['theil','sen','theilsen']:
364            fitintercept = True
365            fit = sen( self._x,
366                       self._y,
367                       **kwargs)
368            slope = fit.slope
369            intercept = fit.intercept
370
371        elif method.lower()=='siegel':
372            fitintercept = True
373            siegel = stats.siegelslopes( self._x,
374                                         self._y )
375            slope = siegel.slope
376            intercept = siegel.intercept
377
378        elif method.lower()=='wls':
379            raise NotImplementedError('WLS regression not implemented yet')
380
381        elif method.lower()=='york':
382            raise NotImplementedError('York regression not implemented yet')
383
384        else:
385            raise ValueError('Undefined method '+method)
386
387        line = dict( slope          = slope,
388                     intercept      = intercept,
389                     fittedvalues   = slope * self._x + intercept,
390                     residuals      = self._y - ( slope * self._x + intercept ),
391                     method         = method,
392                     fitintercept   = fitintercept )
393
394        return line
395
396    def slope(self,method='sma',intercept=True,**kwargs):
397        '''Compute slope of bivariate line fit
398        
399        Parameters
400        ----------
401        method : str
402            line fitting method: sma (default), ols, wls
403        intercept : bool
404            defines whether non-zero intercept should be fitted
405        **kwargs 
406            passed to `fitline`
407
408        Returns
409        -------
410        slope : float
411            value of y intercept
412        '''
413        return self.fitline(method,intercept,**kwargs)['slope']
414
415    def intercept(self,method='sma',intercept=True,**kwargs):
416        '''Compute intercept of bivariate line fit
417        
418        Parameters
419        ----------
420        method : str
421            line fitting method: sma (default) or ols
422        intercept : bool
423            defines whether non-zero intercept should be fitted
424        **kwargs 
425            passed to `fitline`
426
427        Returns
428        -------
429        intercept : float
430            value of y intercept
431        '''
432        return self.fitline(method,intercept,**kwargs)['intercept']
433
434    def _expand_variables(self,variables):
435        '''Expand special strings into a list of variables
436        
437        Parameter
438        ---------
439        variables : list or str, default='common'
440            Special strings ("all","common") will be expanded to a list of variables
441            list arguments will not be modified
442
443        Returns
444        -------
445        list 
446            variable names
447        '''
448        if variables is None:
449            variables='common'
450        if variables=='all':
451            variables=['MD','MAD','RMD','RMAD','MRD','SMD','SMAD',
452                       'MedD','MedAD','RMedD','RMedAD','MedRD',
453                       'NMBF','NMAEF','RMSD',
454                       'R','R2','spearmanr','slope','intercept',
455                       'fitline']
456        elif variables=='common':
457            variables=['MD','MAD','RMD','RMAD','MRD','R2','slope']
458        if not isinstance(variables,list):
459            raise ValueError(
460                'variables must be a list, None, or one of these strings: "all","common"')
461
462        return variables
463
464    def summary_dict(self, variables=None,
465                     fitline_kw=None,
466                     floatformat_fiteqn='{:.3f}' ):
467        '''Summarize bivariate statistics into a dict
468
469        Parameters
470        ----------
471        vars : list or str, default='common'
472            names of attribute variables to include in summary
473            names are case insensitive            
474            The following strings are also accepted in place of a list 
475                "all" (displays all variables)
476                "common" (displays all measures of mean difference)
477        fitline_kw : dict, default=None)
478            keywords passed to self.fitline()
479        
480        Returns
481        -------
482        summary : dict
483            names and values of variables
484        '''
485
486        # List of variables
487        variables = self._expand_variables(variables)
488
489        if fitline_kw is None:
490            fitline_kw = {'method':'sma',
491                          'intercept':True}
492
493        # Construct the dict
494        summary = {}
495        for v in variables:
496            if v in ['slope','intercept']:
497                # These variables are object methods
498                func = getattr(self,v)
499                value = func(**fitline_kw)
500            elif v == 'fitline':
501                line = self.fitline(**fitline_kw)
502                v,value = bivariate_line_equation(line,floatformat_fiteqn,ystring='separate')
503            else:
504                # Retrieve values
505                value = getattr(self,v.lower())
506
507            # summary += (stringformat+'='+floatformat+'\n').format(v,value)
508            summary[v] = value
509
510        return summary
511
512    def summary(self, variables=None, fitline_kw=None,
513                floatformat='{:.4f}', floatformat_fiteqn=None,
514                stringlength=None ):
515        '''Summarize bivariate statistics
516
517        Parameters
518        ----------
519        vars : list or str, default='common'
520            names of attribute variables to include in summary
521            names are case insensitive            
522            The following strings are also accepted in place of a list 
523                "all" (displays all variables)
524                "common" (displays all measures of mean difference)
525        floatformat : str, default='{:.4f}'
526            format specifier for floating point values
527        floatformat_fiteqn : str, default=floatformat
528            format specifier for slope and intercept (a,b) in y = a x + b
529        stringlength : int, default=None
530            length of the variables on output
531            default (None) is to use the length of the longest variable name
532        fitline_kw : dict, default=None
533            keywords passed to `fitline`
534        
535        Returns
536        -------
537        summary : str
538            names and values of variables
539        '''
540        # List of variables
541        variables = self._expand_variables(variables)
542
543        if floatformat_fiteqn is None:
544            floatformat_fiteqn = floatformat
545        if stringlength is None:
546            stringlength = np.max([len(v) for v in variables])
547        stringformat = '{:'+str(stringlength)+'s}'
548
549        # Get a dict containing the needed variables
550        summarydict = self.summary_dict( variables, fitline_kw, floatformat_fiteqn )
551
552        # Extract length of the float numbers from floatformat
553        # import re
554        # floatlength = np.floor( float( re.findall("[-+]?(?:\d*\.*\d+)",
555        #       floatformat )[0] ) ).astype(int)
556
557        # summary = (stringformat+'{:>10s}').format('Variable','Value')
558        summarytext = ''
559        for k,v in summarydict.items():
560            if isinstance(v,str):
561                summarytext += (stringformat+' = {:s}\n').format(k,v)
562            else:
563                summarytext += (stringformat+' = '+floatformat+'\n').format(k,v)
564
565        return summarytext
566
567    def summary_fig_inset(self, ax, variables=None, fitline_kw=None,
568                          floatformat='{:.3f}', floatformat_fiteqn=None,
569                          loc=None, loc_units='axes',
570                          **kwargs):
571        '''Display bivariate statistics as a table inset on a plot axis
572
573        Parameters
574        ----------
575        ax : matplotlib.Figure.Axis 
576            axis where the table will be displayed
577        variables : list or str, default='common'
578            names of attribute variables to include in summary
579            names are case insensitive            
580            The following strings are also accepted in place of a list 
581                "all" (displays all variables)
582                "common" (displays all measures of mean difference)
583        fitline_kw : dict, default=None
584            keywords passed to `fitline`
585        floatformat : str, default='{:.3f}'
586            format specifier for floating point values
587        floatformat_fiteqn : str, default=floatformat
588            format specifier for slope and intercept (a,b) in y = a x + b
589        loc : tuple (x0,y0), default=(0.85, 0.05)
590            location on the axis where the table will be drawn
591            can be in data units or axes units [0-1]
592        loc_units : {'axes' (default), 'data'}
593            specifies whether loc has 'data' units or 'axes' units [0-1]
594                    
595        Returns
596        -------
597        text1, text2 : matplotlib text object
598            Artist for the two text boxes        
599        '''
600        # List of variables
601        variables = self._expand_variables(variables)
602
603        if floatformat_fiteqn is None:
604            floatformat_fiteqn = floatformat
605
606        # Default location in lower right corner
607        if loc is None:
608            loc = (0.8,0.05)
609
610        # Coordinates for loc
611        if loc_units.lower()=='data':
612            coord=ax.transData
613        elif loc_units.lower() in ['axes','axis']:
614            coord=ax.transAxes
615        else:
616            raise ValueError('Display units should be "Data" or "Axes"')
617
618        # Get a dict containing the needed variables
619        summarydict = self.summary_dict( variables, fitline_kw, floatformat_fiteqn )
620
621        # Column of label text
622        label_text = '\n'.join([_texify_name(key) for key in summarydict])
623        # Column of value text
624        value_text = '\n'.join([value if isinstance(value,str) else floatformat.format(value)
625                                for value in summarydict.values()])
626
627        # Check if horizontal alignment keyword is used
628        ha=''
629        try:
630            ha = kwargs['ha']
631        except KeyError:
632            pass
633        try:
634            ha = kwargs['horizontalalignment']
635        except KeyError:
636            pass
637
638        # For right alignment, align on values first
639        # Otherwise, align on labels
640        if ha=='right':
641            first_text = value_text
642            second_text = label_text
643            sign = -1
644        else:
645            first_text = label_text
646            second_text = value_text
647            sign = +1
648
649        # Add first column of text
650        t1=ax.text(loc[0],loc[1],
651                first_text,
652                transform=coord,
653                **kwargs
654                )
655
656        # Get width of first text column
657        bbox = t1.get_window_extent().transformed(coord.inverted())
658        width = bbox.x1-bbox.x0
659
660        # Add second column of text
661        t2 = ax.text(loc[0]+width*sign,loc[1],
662                     second_text,
663                     transform=coord,
664                     **kwargs
665                     )
666
667        ##################################
668        # Early version of this function using matplotlib.table.table()
669
670        # if isinstance(loc,(tuple,list)):
671        #     # Create an inset axis to contain the table
672        #     tableaxis = ax.inset_axes(loc)
673        #     table_width=1
674        # else:
675        #     tableaxis = ax
676
677        # # Display the table on the axis
678        # return mtable.table(
679        #     tableaxis,
680        #     cellText=[[floatformat.format(value)] for value in summarydict.values()],
681        #     rowLabels=[texify_name(key) for key in summarydict],
682        #     colWidths=[table_width/2]*2,
683        #     edges=edges,
684        #     loc=loc, bbox=bbox
685        #     )
686
687        return [t1,t2]

A suite of common statistics to quantify bivariate relationships

Class method 'summary' provides a formatted summary of these statistics

Attributes
  • xmean, ymean (float): mean of x and y variables
  • xmedian, ymedian (float): median of x and y variables
  • xstd, ystd (float): standard deviation of x and y variables
  • mean_difference, md (float): ymean - xmean
  • mean_absolute_difference, mad (float): mean( |y-x| )
  • relative_mean_difference, rmd (float): md / xmean
  • relative_mean_absolute_difference, rmad (float): mad / xmean
  • standardized_mean_difference, smd (float): md / xstd
  • standardized_mean_absolute_difference, smad (float): mad /xstd
  • mean_relative_difference, mrd (float): mean(y/x) - 1
  • median_difference, medd (float): median(y-x)
  • median_absolute_difference, medad (float): median(|y-x|)
  • relative_median_difference, rmedd (float): median(y-x) / xmedian
  • relative_median_absolute_difference, rmedad (float): median(|y-x|) / xmedian
  • median_relative_difference, medianrd, medrd (float): median(y/x)-1
  • normalized_mean_bias_factor, nmbf (float): see nmbf
  • normalized_mean_absolute_error_factor, nmaef (float): see nmaef
  • root_mean_square_difference, rmsd (float): $\sqrt{ \langle (y - x)^2 \rangle }$
  • covariance (float): cov(x,y)
  • correlation_pearson, correlation, pearsonr, R, r (float): Pearson linear correlation coefficient
  • correlation_spearman, spearmanr (float): Spearman, non-parametric rank correlation coefficient
  • R2, r2 (float): Linear coefficient of determination, $R^2$
BivariateStatistics(x, y, w=None, dropna=False, data=None)
213    def __init__(self,x,y,w=None,dropna=False,data=None):
214        '''Compute suite of bivariate statistics during initialization
215        
216        Statistic values are saved in attributes.
217        CAUTION: Weights w are ignored except in SMA fit
218
219        Parameters
220        ----------
221        x : ndarray or str
222            independent variable values
223        y : ndarray or str
224            dependent variable values, same size as x
225        w : ndarray or str, optional
226            weights for points (x,y), same size as x and y
227        dropna : bool, optional (default=False)
228            drops NaN values from x, y, and w
229        data : dict-like, optional
230            if x, y, or w are str, then they should be keys in data
231        '''
232
233        # Get values from data if needed
234        if data is None and (isinstance(x,str) or isinstance(y,str) or isinstance(w,str)):
235            raise ValueError( 'Data argument must be used if x, y, or w is a string')
236        if isinstance(x,str):
237            x = data[x]
238        if isinstance(y,str):
239            y = data[y]
240        if isinstance(w,str):
241            w = data[w]
242
243        #Ensure that x and y have same length
244        if len(x) != len(y):
245            raise ValueError( 'Arguments x and y must have the same length' )
246        if w is None:
247            w = np.ones_like(x)
248        if len(w) != len(x):
249            raise ValueError( 'Argument w (if present) must have the same length as x' )
250
251        # Drop NaN values
252        if dropna:
253            isna = np.isnan(x*y*w)
254            x = x[~isna]
255            y = y[~isna]
256            w = w[~isna]
257
258        diff = y - x
259        absdiff = np.abs( y - x )
260        # Ignore divide by zero and 0/0 while dividing
261        old_settings = np.seterr(divide='ignore',invalid='ignore')
262        ratio = y/x
263        np.seterr(**old_settings)
264
265        # Means, medians, and standard deviations
266        self.xmean = np.mean(x)
267        self.ymean = np.mean(y)
268        self.xmedian = np.median(x)
269        self.ymedian = np.median(y)
270        self.xstd   = np.std(x)
271        self.ystd   = np.std(y)
272
273        # Save values for use later
274        self._x = x
275        self._y = y
276        self._w = w
277
278        # Mean and mean absolute differences
279        self.mean_difference            = self.md   = self.ymean - self.xmean
280        self.mean_absolute_difference   = self.mad  = np.mean( absdiff )
281
282        # Relative and standardized differences
283        self.relative_mean_difference           = self.rmd  = self.mean_difference / self.xmean
284        self.relative_mean_absolute_difference  = self.rmad = self.mean_absolute_difference / self.xmean
285        self.standardized_mean_difference       = self.smd  = self.mean_difference / self.xstd
286        self.standardized_mean_absolute_difference  = self.smad = self.mean_absolute_difference / self.xstd
287
288        # Mean and median relative differences
289        self.mean_relative_difference   = self.mrd  = np.mean( ratio - 1 )
290        self.median_relative_difference = self.medianrd = self.medrd = np.median( ratio - 1 )
291
292        # Median and median absolute differences
293        self.median_difference          = self.medd  = np.median( diff )
294        self.median_absolute_difference = self.medad = np.median( absdiff )
295
296        # Relative median differences
297        self.relative_median_difference          = self.rmedd  = self.median_difference / self.xmedian
298        self.relative_median_absolute_difference = self.rmedad = self.median_absolute_difference / self.xmedian
299
300        self.normalized_mean_bias_factor            = self.nmbf  = nmbf(x,y)
301        self.normalized_mean_absolute_error_factor  = self.nmaef = nmaef(x,y)
302
303        # RMS difference
304        self.root_mean_square_difference    = self.rmsd     = np.sqrt( np.mean( np.power( diff, 2) ) )
305
306        # Covariance, correlation
307        self.covariance = np.cov(x,y)[0][1]
308        self.correlation = self.correlation_pearson = self.R = self.r = self.pearsonr = \
309            np.corrcoef(x,y)[0][1]
310        self.correlation_spearman = self.spearmanr = stats.spearmanr(x,y).statistic
311        self.R2 = self.r2 = self.R**2

Compute suite of bivariate statistics during initialization

Statistic values are saved in attributes. CAUTION: Weights w are ignored except in SMA fit

Parameters
  • x (ndarray or str): independent variable values
  • y (ndarray or str): dependent variable values, same size as x
  • w (ndarray or str, optional): weights for points (x,y), same size as x and y
  • dropna (bool, optional (default=False)): drops NaN values from x, y, and w
  • data (dict-like, optional): if x, y, or w are str, then they should be keys in data
xmean
ymean
xmedian
ymedian
xstd
ystd
covariance
def fitline(self, method='sma', intercept=True, **kwargs):
317    def fitline(self,method='sma',intercept=True,**kwargs):
318        '''Compute bivariate line fit
319        
320        Parameters
321        ----------
322        method : str
323            line fitting method: sma (default), ols, wls, York, sen, siegel
324        intercept : bool
325            defines whether non-zero intercept should be fitted
326        **kwargs 
327            passed to `acgc.stats.sma` (e.g. robust=True)
328
329        Returns
330        -------
331        result : dict
332            dictionary with keys:
333            - slope (float)
334                slope of fitted line
335            - intercept (float)
336                intercept of fitted line
337            - fittedvalues (array (N,))
338                values on fit line
339            - residuals (array (N,))
340                residual from fit line
341        '''
342
343        fitintercept = intercept
344
345        if method.lower()=='sma':
346            fit = sma(  self._x,
347                        self._y,
348                        self._w,
349                        intercept=fitintercept,
350                        **kwargs)
351            slope = fit['slope']
352            intercept= fit['intercept']
353
354        elif method.lower()=='ols':
355            if fitintercept:
356                ols = np.linalg.lstsq( np.vstack([self._x,np.ones(len(self._x))]).T,
357                                      self._y, rcond=None )
358            else:
359                ols = np.linalg.lstsq( np.vstack([self._x]).T, self._y, rcond=None )
360            slope = ols[0][0]
361            intercept = ols[0][1]
362
363        elif method.lower() in ['theil','sen','theilsen']:
364            fitintercept = True
365            fit = sen( self._x,
366                       self._y,
367                       **kwargs)
368            slope = fit.slope
369            intercept = fit.intercept
370
371        elif method.lower()=='siegel':
372            fitintercept = True
373            siegel = stats.siegelslopes( self._x,
374                                         self._y )
375            slope = siegel.slope
376            intercept = siegel.intercept
377
378        elif method.lower()=='wls':
379            raise NotImplementedError('WLS regression not implemented yet')
380
381        elif method.lower()=='york':
382            raise NotImplementedError('York regression not implemented yet')
383
384        else:
385            raise ValueError('Undefined method '+method)
386
387        line = dict( slope          = slope,
388                     intercept      = intercept,
389                     fittedvalues   = slope * self._x + intercept,
390                     residuals      = self._y - ( slope * self._x + intercept ),
391                     method         = method,
392                     fitintercept   = fitintercept )
393
394        return line

Compute bivariate line fit

Parameters
  • method (str): line fitting method: sma (default), ols, wls, York, sen, siegel
  • intercept (bool): defines whether non-zero intercept should be fitted
  • **kwargs: passed to acgc.stats.sma (e.g. robust=True)
Returns
  • result (dict): dictionary with keys:
    • slope (float) slope of fitted line
    • intercept (float) intercept of fitted line
    • fittedvalues (array (N,)) values on fit line
    • residuals (array (N,)) residual from fit line
def slope(self, method='sma', intercept=True, **kwargs):
396    def slope(self,method='sma',intercept=True,**kwargs):
397        '''Compute slope of bivariate line fit
398        
399        Parameters
400        ----------
401        method : str
402            line fitting method: sma (default), ols, wls
403        intercept : bool
404            defines whether non-zero intercept should be fitted
405        **kwargs 
406            passed to `fitline`
407
408        Returns
409        -------
410        slope : float
411            value of y intercept
412        '''
413        return self.fitline(method,intercept,**kwargs)['slope']

Compute slope of bivariate line fit

Parameters
  • method (str): line fitting method: sma (default), ols, wls
  • intercept (bool): defines whether non-zero intercept should be fitted
  • **kwargs: passed to fitline
Returns
  • slope (float): slope of the fitted line
def intercept(self, method='sma', intercept=True, **kwargs):
415    def intercept(self,method='sma',intercept=True,**kwargs):
416        '''Compute intercept of bivariate line fit
417        
418        Parameters
419        ----------
420        method : str
421            line fitting method: sma (default) or ols
422        intercept : bool
423            defines whether non-zero intercept should be fitted
424        **kwargs 
425            passed to `fitline`
426
427        Returns
428        -------
429        intercept : float
430            value of y intercept
431        '''
432        return self.fitline(method,intercept,**kwargs)['intercept']

Compute intercept of bivariate line fit

Parameters
  • method (str): line fitting method: sma (default) or ols
  • intercept (bool): defines whether non-zero intercept should be fitted
  • **kwargs: passed to fitline
Returns
  • intercept (float): value of y intercept
def summary_dict(self, variables=None, fitline_kw=None, floatformat_fiteqn='{:.3f}'):
464    def summary_dict(self, variables=None,
465                     fitline_kw=None,
466                     floatformat_fiteqn='{:.3f}' ):
467        '''Summarize bivariate statistics into a dict
468
469        Parameters
470        ----------
471        vars : list or str, default='common'
472            names of attribute variables to include in summary
473            names are case insensitive            
474            The following strings are also accepted in place of a list 
475                "all" (displays all variables)
476                "common" (displays all measures of mean difference)
477        fitline_kw : dict, default=None)
478            keywords passed to self.fitline()
479        
480        Returns
481        -------
482        summary : dict
483            names and values of variables
484        '''
485
486        # List of variables
487        variables = self._expand_variables(variables)
488
489        if fitline_kw is None:
490            fitline_kw = {'method':'sma',
491                          'intercept':True}
492
493        # Construct the dict
494        summary = {}
495        for v in variables:
496            if v in ['slope','intercept']:
497                # These variables are object methods
498                func = getattr(self,v)
499                value = func(**fitline_kw)
500            elif v == 'fitline':
501                line = self.fitline(**fitline_kw)
502                v,value = bivariate_line_equation(line,floatformat_fiteqn,ystring='separate')
503            else:
504                # Retrieve values
505                value = getattr(self,v.lower())
506
507            # summary += (stringformat+'='+floatformat+'\n').format(v,value)
508            summary[v] = value
509
510        return summary

Summarize bivariate statistics into a dict

Parameters
  • variables (list or str, default='common'): names of attribute variables to include in summary; names are case insensitive.
    The following strings are also accepted in place of a list "all" (displays all variables) "common" (displays all measures of mean difference)
  • fitline_kw (dict, default=None): keywords passed to self.fitline()
Returns
  • summary (dict): names and values of variables
def summary( self, variables=None, fitline_kw=None, floatformat='{:.4f}', floatformat_fiteqn=None, stringlength=None):
512    def summary(self, variables=None, fitline_kw=None,
513                floatformat='{:.4f}', floatformat_fiteqn=None,
514                stringlength=None ):
515        '''Summarize bivariate statistics
516
517        Parameters
518        ----------
519        vars : list or str, default='common'
520            names of attribute variables to include in summary
521            names are case insensitive            
522            The following strings are also accepted in place of a list 
523                "all" (displays all variables)
524                "common" (displays all measures of mean difference)
525        floatformat : str, default='{:.4f}'
526            format specifier for floating point values
527        floatformat_fiteqn : str, default=floatformat
528            format specifier for slope and intercept (a,b) in y = a x + b
529        stringlength : int, default=None
530            length of the variables on output
531            default (None) is to use the length of the longest variable name
532        fitline_kw : dict, default=None
533            keywords passed to `fitline`
534        
535        Returns
536        -------
537        summary : str
538            names and values of variables
539        '''
540        # List of variables
541        variables = self._expand_variables(variables)
542
543        if floatformat_fiteqn is None:
544            floatformat_fiteqn = floatformat
545        if stringlength is None:
546            stringlength = np.max([len(v) for v in variables])
547        stringformat = '{:'+str(stringlength)+'s}'
548
549        # Get a dict containing the needed variables
550        summarydict = self.summary_dict( variables, fitline_kw, floatformat_fiteqn )
551
552        # Extract length of the float numbers from floatformat
553        # import re
554        # floatlength = np.floor( float( re.findall("[-+]?(?:\d*\.*\d+)",
555        #       floatformat )[0] ) ).astype(int)
556
557        # summary = (stringformat+'{:>10s}').format('Variable','Value')
558        summarytext = ''
559        for k,v in summarydict.items():
560            if isinstance(v,str):
561                summarytext += (stringformat+' = {:s}\n').format(k,v)
562            else:
563                summarytext += (stringformat+' = '+floatformat+'\n').format(k,v)
564
565        return summarytext

Summarize bivariate statistics

Parameters
  • variables (list or str, default='common'): names of attribute variables to include in summary; names are case insensitive.
    The following strings are also accepted in place of a list "all" (displays all variables) "common" (displays all measures of mean difference)
  • floatformat (str, default='{:.4f}'): format specifier for floating point values
  • floatformat_fiteqn (str, default=floatformat): format specifier for slope and intercept (a,b) in y = a x + b
  • stringlength (int, default=None): length of the variables on output default (None) is to use the length of the longest variable name
  • fitline_kw (dict, default=None): keywords passed to fitline
Returns
  • summary (str): names and values of variables
def summary_fig_inset( self, ax, variables=None, fitline_kw=None, floatformat='{:.3f}', floatformat_fiteqn=None, loc=None, loc_units='axes', **kwargs):
567    def summary_fig_inset(self, ax, variables=None, fitline_kw=None,
568                          floatformat='{:.3f}', floatformat_fiteqn=None,
569                          loc=None, loc_units='axes',
570                          **kwargs):
571        '''Display bivariate statistics as a table inset on a plot axis
572
573        Parameters
574        ----------
575        ax : matplotlib.Figure.Axis 
576            axis where the table will be displayed
577        variables : list or str, default='common'
578            names of attribute variables to include in summary
579            names are case insensitive            
580            The following strings are also accepted in place of a list 
581                "all" (displays all variables)
582                "common" (displays all measures of mean difference)
583        fitline_kw : dict, default=None
584            keywords passed to `fitline`
585        floatformat : str, default='{:.3f}'
586            format specifier for floating point values
587        floatformat_fiteqn : str, default=floatformat
588            format specifier for slope and intercept (a,b) in y = a x + b
589        loc : tuple (x0,y0), default=(0.85, 0.05)
590            location on the axis where the table will be drawn
591            can be in data units or axes units [0-1]
592        loc_units : {'axes' (default), 'data'}
593            specifies whether loc has 'data' units or 'axes' units [0-1]
594                    
595        Returns
596        -------
597        text1, text2 : matplotlib text object
598            Artist for the two text boxes        
599        '''
600        # List of variables
601        variables = self._expand_variables(variables)
602
603        if floatformat_fiteqn is None:
604            floatformat_fiteqn = floatformat
605
606        # Default location in lower right corner
607        if loc is None:
608            loc = (0.8,0.05)
609
610        # Coordinates for loc
611        if loc_units.lower()=='data':
612            coord=ax.transData
613        elif loc_units.lower() in ['axes','axis']:
614            coord=ax.transAxes
615        else:
616            raise ValueError('Display units should be "Data" or "Axes"')
617
618        # Get a dict containing the needed variables
619        summarydict = self.summary_dict( variables, fitline_kw, floatformat_fiteqn )
620
621        # Column of label text
622        label_text = '\n'.join([_texify_name(key) for key in summarydict])
623        # Column of value text
624        value_text = '\n'.join([value if isinstance(value,str) else floatformat.format(value)
625                                for value in summarydict.values()])
626
627        # Check if horizontal alignment keyword is used
628        ha=''
629        try:
630            ha = kwargs['ha']
631        except KeyError:
632            pass
633        try:
634            ha = kwargs['horizontalalignment']
635        except KeyError:
636            pass
637
638        # For right alignment, align on values first
639        # Otherwise, align on labels
640        if ha=='right':
641            first_text = value_text
642            second_text = label_text
643            sign = -1
644        else:
645            first_text = label_text
646            second_text = value_text
647            sign = +1
648
649        # Add first column of text
650        t1=ax.text(loc[0],loc[1],
651                first_text,
652                transform=coord,
653                **kwargs
654                )
655
656        # Get width of first text column
657        bbox = t1.get_window_extent().transformed(coord.inverted())
658        width = bbox.x1-bbox.x0
659
660        # Add second column of text
661        t2 = ax.text(loc[0]+width*sign,loc[1],
662                     second_text,
663                     transform=coord,
664                     **kwargs
665                     )
666
667        ##################################
668        # Early version of this function using matplotlib.table.table()
669
670        # if isinstance(loc,(tuple,list)):
671        #     # Create an inset axis to contain the table
672        #     tableaxis = ax.inset_axes(loc)
673        #     table_width=1
674        # else:
675        #     tableaxis = ax
676
677        # # Display the table on the axis
678        # return mtable.table(
679        #     tableaxis,
680        #     cellText=[[floatformat.format(value)] for value in summarydict.values()],
681        #     rowLabels=[texify_name(key) for key in summarydict],
682        #     colWidths=[table_width/2]*2,
683        #     edges=edges,
684        #     loc=loc, bbox=bbox
685        #     )
686
687        return [t1,t2]

Display bivariate statistics as a table inset on a plot axis

Parameters
  • ax (matplotlib.Figure.Axis): axis where the table will be displayed
  • variables (list or str, default='common'): names of attribute variables to include in the summary; names are case insensitive.
    The following strings are also accepted in place of a list: "all" (displays all variables), "common" (displays all measures of mean difference)
  • fitline_kw (dict, default=None): keywords passed to fitline
  • floatformat (str, default='{:.3f}'): format specifier for floating point values
  • floatformat_fiteqn (str, default=floatformat): format specifier for slope and intercept (a,b) in y = a x + b
  • loc (tuple (x0,y0), default=(0.8, 0.05)): location on the axis where the table will be drawn; can be in data units or axes units [0-1]
  • loc_units ({'axes' (default), 'data'}): specifies whether loc has 'data' units or 'axes' units [0-1]
Returns
  • text1, text2 (matplotlib text object): Artist for the two text boxes
def nmb(x0, x1):
def nmb( x0, x1 ):
    '''Compute Normalized Mean Bias (NMB)

    NMB = ( mean(x1) - mean(x0) ) / mean(x0)

    Parameters
    ----------
    x0 : array_like
        reference values
    x1 : array_like
        experiment values

    Returns
    -------
    scalar
        mean(x1) / mean(x0) - 1
    '''

    # Both populations must have the same number of elements
    assert (len(x0) == len(x1)), \
        "Parameters x0 and x1 must have the same length"

    # Means of the reference and experiment populations
    reference_mean, experiment_mean = np.mean(x0), np.mean(x1)

    # Ratio of means, expressed relative to 1
    ratio = experiment_mean / reference_mean
    return ratio - 1

Compute Normalized Mean Bias (NMB)

NMB = ( mean(x1) - mean(x0) ) / mean(x0)

Parameters
  • x0 (array_like): reference values
  • x1 (array_like): experiment values
def nmae(x0, x1):
def nmae( x0, x1 ):
    '''Compute Normalized Mean Absolute Error (NMAE)

    NMAE = mean(abs(x1 - x0)) / abs(mean(x0))

    Parameters
    ----------
    x0 : array_like
        reference values
    x1 : array_like
        experiment values

    Returns
    -------
    scalar
        mean absolute difference divided by the absolute value of the
        mean of the reference values
    '''

    # Mean of the reference values
    x0_mean = np.mean(x0)

    # Mean absolute difference
    # np.subtract accepts any array_like (lists, tuples, arrays), whereas
    # the `-` operator raises TypeError for plain Python sequences
    abs_diff = np.mean( np.abs( np.subtract(x1, x0) ) )

    # Metric value
    return abs_diff / np.abs( x0_mean )

Compute Normalized Mean Absolute Error (NMAE)

NMAE = mean(abs(x1 - x0)) / abs(mean(x0))

Parameters
  • x0 (array_like): reference values
  • x1 (array_like): experiment values
def nmbf(x0, x1):
def nmbf( x0, x1 ):
    '''Compute Normalized Mean Bias Factor (NMBF)

    Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125

    NMBF = mean(x1) / mean(x0) - 1   if mean(x1) >= mean(x0)
    NMBF = 1 - mean(x0) / mean(x1)   otherwise

    Parameters
    ----------
    x0 : array_like
        reference values
    x1 : array_like
        experiment values

    Returns
    -------
    scalar
        NMBF value
    '''

    # Both populations must have the same number of elements
    assert (len(x0) == len(x1)), \
        "Parameters x0 and x1 must have the same length"

    # Means of the reference and experiment populations
    reference_mean = np.mean(x0)
    experiment_mean = np.mean(x1)

    # For positive means the two branches are equivalent to
    #   S * ( exp( abs( ln( mean(x1) / mean(x0) ) ) ) - 1 )
    # with S = sign( mean(x1) - mean(x0) )
    return ( experiment_mean / reference_mean - 1
             if experiment_mean >= reference_mean
             else 1 - reference_mean / experiment_mean )

Compute Normalized Mean Bias Factor (NMBF)

Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125

Parameters
  • x0 (array_like): reference values
  • x1 (array_like): experiment values
def nmaef(x0, x1):
def nmaef( x0, x1 ):
    '''Compute Normalized Mean Absolute Error Factor (NMAEF)

    Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125

    NMAEF = mean(abs(x1 - x0)) / mean(x0)   if mean(x1) >= mean(x0)
    NMAEF = mean(abs(x1 - x0)) / mean(x1)   otherwise

    Parameters
    ----------
    x0 : array_like
        reference values
    x1 : array_like
        experiment values

    Returns
    -------
    scalar
        NMAEF value
    '''

    # Ensure that arguments have the same length
    assert (len(x0) == len(x1)), \
        "Parameters x0 and x1 must have the same length"

    # Mean values
    x0_mean = np.mean(x0)
    x1_mean = np.mean(x1)

    # Mean absolute difference
    # np.subtract accepts any array_like (lists, tuples, arrays), whereas
    # the `-` operator raises TypeError for plain Python sequences
    abs_diff = np.mean( np.abs( np.subtract(x1, x0) ) )

    # Metric value: normalize by the smaller of the two means
    if x1_mean >= x0_mean:
        result = abs_diff / x0_mean
    else:
        result = abs_diff / x1_mean

    return result

Compute Normalized Mean Absolute Error Factor (NMAEF)

Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125

Parameters
  • x0 (array_like): reference values
  • x1 (array_like): experiment values