acgc.stats.bivariate

Bivariate statistics

Statistical measures of relationships between two populations

  1#!/usr/bin/env python3
  2# -*- coding: utf-8 -*-
  3""" Bivariate statistics
  4
  5Statistical measures of relationships between two populations
  6"""
  7
  8import numpy as np
  9from scipy import stats
 10from .bivariate_lines import sen, sma, bivariate_line_equation
 11# import xarray as xr
 12
 13__all__ = [
 14    "BivariateStatistics",
 15    "nmb",
 16    "nmae",
 17    "nmbf",
 18    "nmaef"
 19]
 20
 21def nmb( x0, x1 ):
 22    '''Compute Normalized Mean Bias (NMB)
 23
 24    NMB = ( mean(x1) - mean(x0) ) / mean(x0)
 25
 26    Parameters
 27    ----------
 28    x0 : array_like
 29        reference values
 30    x1 : array_like
 31        experiment values
 32    '''
 33
 34    assert (len(x0) == len(x1)), \
 35        "Parameters x0 and x1 must have the same length"
 36
 37    # Mean values
 38    x0_mean = np.mean(x0)
 39    x1_mean = np.mean(x1)
 40
 41    # Metric value
 42    return x1_mean / x0_mean - 1
 43
 44def nmae( x0, x1 ):
 45    '''Compute Normalized Mean Absolute Error (NMAE)
 46
 47    NMAE = mean(abs(x1 - x0)) / abs(mean(x0))
 48
 49    Parameters
 50    ---------
 51    x0 : array_like
 52        reference values
 53    x1 : array_like
 54        experiment values
 55    '''
 56
 57     # Mean values
 58    x0_mean = np.mean(x0)
 59
 60    # Mean absolute difference
 61    abs_diff = np.mean( np.abs(x1 - x0) )
 62
 63    # Metric value
 64    return abs_diff / np.abs( x0_mean )
 65
 66
 67def nmbf( x0, x1 ):
 68    '''Compute Normalized Mean Bias Factor (NMBF)
 69
 70    Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125
 71
 72    Parameters
 73    ----------
 74    x0 : array_like
 75        reference values
 76    x1 : array_like
 77        experiment values
 78    '''
 79
 80    # Ensure that arguments have the same length
 81    assert (len(x0) == len(x1)), \
 82        "Parameters x0 and x1 must have the same length"
 83
 84    # Mean values
 85    x0_mean = np.mean(x0)
 86    x1_mean = np.mean(x1)
 87
 88    # Metric value
 89    if x1_mean >= x0_mean:
 90        result = x1_mean / x0_mean - 1
 91    else:
 92        result= 1 - x0_mean / x1_mean
 93    # Equivalent (faster?) implementation
 94    #S = (mMean - oMean) / np.abs(mMean - oMean)
 95    #result = S * ( np.exp( np.abs( mMean / oMean )) - 1 )
 96
 97    return result
 98
 99def nmaef( x0, x1 ):
100    '''Compute Normalized Mean Absolute Error Factor (NMAEF)
101
102    Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125
103    
104    Parameters
105    ----------
106    x0 : array_like
107        reference values
108    x1 : array_like
109        experiment values
110    '''
111
112    # Ensure that arguments have the same length
113    assert (len(x0) == len(x1)), \
114        "Parameters x0 and x1 must have the same length"
115
116    # Mean values
117    x0_mean = np.mean(x0)
118    x1_mean = np.mean(x1)
119
120    # Mean absolute difference
121    abs_diff = np.mean( np.abs(x1 - x0))
122
123    # Metric value
124    if x1_mean >= x0_mean:
125        result = abs_diff / x0_mean 
126    else:
127        result = abs_diff / x1_mean
128    # Equivalent (faster?) implementation
129    #S = (exp_mean - ref_mean) / np.abs(exp_mean - ref_mean)
130    #result = abs_diff / ( oMean**((1+S)/2) * mMean**((1-S)/2) )
131
132    return result
133
def _texify_name(name):
    '''Return a LaTeX formatted string for some variables

    Names without a LaTeX form are returned unchanged.

    Parameters
    ----------
    name : str
        variable name

    Returns
    -------
    pretty_name : str
        LaTeX string for recognized names, otherwise `name` itself
    '''
    # Names where upper and lower case have distinct meanings
    case_sensitive = {
        'R2': '$R^2$',
        'r2': '$r^2$',
    }
    # Names recognized regardless of case
    case_insensitive = {
        'n':     r'$n$',
        'y_ols': r'$y_{\rm OLS}$',
        'y_sma': r'$y_{\rm SMA}$',
        'y_sen': r'$y_{\rm Sen}$',
    }

    if name in case_sensitive:
        return case_sensitive[name]
    return case_insensitive.get(name.lower(), name)
160
def _number2str(value,
                intformat='{:d}',
                floatformat='{:.4f}'):
    '''Convert a number to a string, choosing the format by its type

    Strings pass through unchanged. Python and numpy integers use
    `intformat`; every other value uses `floatformat`.

    Parameters
    ----------
    value : numeric, str
        value to be converted
    intformat : str, default='{:d}'
        format specifier for integer types
    floatformat : str, default='{:.4f}'
        format specifier for float types

    Returns
    -------
    str
    '''
    # Text needs no formatting
    if isinstance(value, str):
        return value

    is_integer = isinstance(value, (int, np.integer))
    template = intformat if is_integer else floatformat
    return template.format(value)
186
class BivariateStatistics:
    '''A suite of common statistics to quantify bivariate relationships

    Class method 'summary' provides a formatted summary of these statistics
    
    Attributes
    ----------
    count, n : int
        number of valid (not NaN) data value pairs
    xmean, ymean : float
        mean of x and y variables
    xmedian, ymedian :float
        median of x and y variables
    xstd, ystd : float
        standard deviation of x and y variables
    mean_difference, md : float
        ymean - xmean
    std_difference, stdd : float
        std( y - x )
    mean_absolute_difference, mad : float
        mean( |y-x| )
    relative_mean_difference, rmd : float
        md / xmean
    relative_mean_absolute_difference, rmad :float
        mad / xmean
    standardized_mean_difference, smd : float
        md / xstd
    standardized_mean_absolute_difference, smad : float
        mad /xstd
    mean_relative_difference, mrd : float
        mean(y/x) - 1
    mean_log10_ratio, mlr : float
        mean( log10(y/x) )
    std_log10_ratio, stdlr : float
        std( log10(y/x) )
    mean_absolute_log10_ratio, malr : float
        mean( abs( log10(y/x) ) )
    median_difference, medd : float
        median(y-x)
    median_absolute_difference, medad : float
        median(|y-x|)
    relative_median_difference, rmedd : float
        median(y-x) / xmedian
    relative_median_absolute_difference, rmedad : float
        median(|y-x|) / xmedian
    median_relative_difference, medianrd, medrd : float
        median(y/x)-1
    median_log10_ratio, medlr : float
        median( log10(y/x) )
    median_absolute_log10_ratio, medalr : float
        median( abs( log10(y/x) ) )
    normalized_mean_bias_factor, nmbf : float
        see `nmbf` 
    normalized_mean_absolute_error_factor, nmaef : float
        see `nmaef`
    root_mean_square_difference, rmsd : float
        $\\sqrt{ \\langle (y - x)^2 \\rangle }$
    root_mean_square_log10_ratio, rmslr : float
        $\\sqrt{ \\langle \\log_{10}(y/x)^2 \\rangle }$
    covariance, cov : float
        cov(x,y)
    correlation_pearson, correlation, pearsonr, R, r : float
        Pearson linear correlation coefficient 
    correlation_spearman, spearmanr : float
        Spearman, non-parametric rank correlation coefficient
    R2, r2 : float
        Linear coefficient of determination, $R^2$
    '''

    def __init__(self,x,y,w=None,dropna=False,data=None):
        '''Compute suite of bivariate statistics during initialization
        
        Statistic values are saved in attributes.
        CAUTION: Weights w are ignored except in SMA fit

        Parameters
        ----------
        x : ndarray or str
            independent variable values
        y : ndarray or str
            dependent variable values, same size as x
        w : ndarray or str, optional
            weights for points (x,y), same size as x and y
        dropna : bool, optional (default=False)
            drops NaN values from x, y, and w
        data : dict-like, optional
            if x, y, or w are str, then they should be keys in data

        Raises
        ------
        ValueError
            if x, y, or w is a str but data is not provided, or
            if x, y, and w do not all have the same length
        '''

        # Get values from data if needed
        if data is None and any(isinstance(arg,str) for arg in (x,y,w)):
            raise ValueError( 'Data argument must be used if x, y, or w is a string')
        if isinstance(x,str):
            x = data[x]
        if isinstance(y,str):
            y = data[y]
        if isinstance(w,str):
            w = data[w]

        #Ensure that x and y have same length
        if len(x) != len(y):
            raise ValueError( 'Arguments x and y must have the same length' )
        if w is None:
            w = np.ones_like(x)
        if len(w) != len(x):
            raise ValueError( 'Argument w (if present) must have the same length as x' )

        # Drop NaN values
        # The product is NaN wherever any one of x, y, w is NaN
        if dropna:
            isna = np.isnan(x*y*w)
            x = x[~isna]
            y = y[~isna]
            w = w[~isna]

        # Differences and ratios used repeatedly
        diff = y - x
        absdiff = np.abs( diff )
        # Ignore divide by zero and 0/0 while dividing;
        # the context manager restores the previous settings even on error
        with np.errstate(divide='ignore',invalid='ignore'):
            ratio = y/x
            log10ratio = np.log10(ratio)

        # Number of data points
        self.count = self.n = len(x)

        # Means, medians, and standard deviations
        self.xmean = np.mean(x)
        self.ymean = np.mean(y)
        self.xmedian = np.median(x)
        self.ymedian = np.median(y)
        self.xstd   = np.std(x)
        self.ystd   = np.std(y)

        # Save values for use later (line fitting)
        self._x = x
        self._y = y
        self._w = w

        # Mean and mean absolute differences
        self.mean_difference            = self.md   = self.ymean - self.xmean
        self.mean_absolute_difference   = self.mad  = np.mean( absdiff )
        self.std_difference             = self.stdd = np.std( diff )

        # Relative and standardized differences
        self.relative_mean_difference           = self.rmd  = self.mean_difference / self.xmean
        self.relative_mean_absolute_difference  = self.rmad = self.mean_absolute_difference / self.xmean
        self.standardized_mean_difference       = self.smd  = self.mean_difference / self.xstd
        self.standardized_mean_absolute_difference  = self.smad = self.mean_absolute_difference / self.xstd

        # Mean and median relative differences
        self.mean_relative_difference   = self.mrd  = np.mean( ratio - 1 )
        self.median_relative_difference = self.medianrd = self.medrd = np.median( ratio - 1 )

        # Median and median absolute differences
        self.median_difference          = self.medd  = np.median( diff )
        self.median_absolute_difference = self.medad = np.median( absdiff )

        # Relative median differences
        self.relative_median_difference          = self.rmedd  = self.median_difference / self.xmedian
        self.relative_median_absolute_difference = self.rmedad = self.median_absolute_difference / self.xmedian

        # Normalized mean bias factor and absolute error factor
        self.normalized_mean_bias_factor            = self.nmbf  = nmbf(x,y)
        self.normalized_mean_absolute_error_factor  = self.nmaef = nmaef(x,y)

        # Mean and mean absolute log ratio
        self.mean_log10_ratio          = self.mlr  = np.mean( log10ratio )
        self.mean_absolute_log10_ratio = self.malr = np.mean( np.abs( log10ratio ) )
        self.std_log10_ratio           = self.stdlr= np.std( log10ratio )

        # Median and median absolute log ratio
        self.median_log10_ratio          = self.medlr  = np.median( log10ratio )
        self.median_absolute_log10_ratio = self.medalr = np.median( np.abs( log10ratio ) )

        # RMS difference
        self.root_mean_square_difference = self.rmsd   = np.sqrt( np.mean( np.power( diff, 2) ) )
        # RMS log ratio
        self.root_mean_square_log10_ratio = self.rmslr = np.sqrt( np.mean( np.power( log10ratio, 2 )))

        # Covariance, correlation
        # 'cov' alias is required by the "all" variable list used in summaries
        self.covariance = self.cov = np.cov(x,y)[0][1]
        self.correlation = self.correlation_pearson = self.R = self.r = self.pearsonr = \
            np.corrcoef(x,y)[0][1]
        self.correlation_spearman = self.spearmanr = stats.spearmanr(x,y).statistic
        self.R2 = self.r2 = self.R**2

    def __getitem__(self,key):
        '''Accesses attribute values via object['key']'''
        return getattr(self,key)

    def fitline(self,method='sma',intercept=True,**kwargs):
        '''Compute bivariate line fit
        
        Parameters
        ----------
        method : str
            line fitting method: sma (default), ols, sen, siegel
            (wls and York are recognized but not implemented).
            Method names are case insensitive
        intercept : bool
            defines whether non-zero intercept should be fitted.
            Ignored by sen and siegel, which always fit an intercept
        **kwargs 
            passed to `acgc.stats.sma` (e.g. robust=True) or `sen`

        Returns
        -------
        result : dict
            dictionary with keys:
            - slope (float)
                slope of fitted line
            - intercept (float)
                intercept of fitted line (zero when no intercept is fitted by OLS)
            - fittedvalues (array (N,))
                values on fit line
            - residuals (array (N,))
                residual from fit line
            - method (str)
                the method argument, as given
            - fitintercept (bool)
                whether an intercept was fitted

        Raises
        ------
        NotImplementedError
            for methods wls and York
        ValueError
            for unrecognized methods
        '''

        fitintercept = intercept
        method_key = method.lower()

        if method_key=='sma':
            fit = sma(  self._x,
                        self._y,
                        self._w,
                        intercept=fitintercept,
                        **kwargs)
            slope = fit['slope']
            intercept= fit['intercept']

        elif method_key=='ols':
            # Design matrix has a column of ones only when fitting an intercept
            if fitintercept:
                design = np.vstack([self._x,np.ones(len(self._x))]).T
            else:
                design = np.vstack([self._x]).T
            coef = np.linalg.lstsq( design, self._y, rcond=None )[0]
            slope = coef[0]
            # Without an intercept column there is only one coefficient
            intercept = coef[1] if fitintercept else 0.0

        elif method_key in ['theil','sen','theilsen']:
            fitintercept = True
            fit = sen( self._x,
                       self._y,
                       **kwargs)
            slope = fit.slope
            intercept = fit.intercept

        elif method_key=='siegel':
            fitintercept = True
            # scipy signature is siegelslopes(y, x): dependent variable first.
            # Result unpacks as (slope, intercept)
            slope, intercept = stats.siegelslopes( self._y,
                                                   self._x )

        elif method_key=='wls':
            raise NotImplementedError('WLS regression not implemented yet')

        elif method_key=='york':
            raise NotImplementedError('York regression not implemented yet')

        else:
            raise ValueError('Undefined method '+method)

        fittedvalues = slope * self._x + intercept

        line = dict( slope          = slope,
                     intercept      = intercept,
                     fittedvalues   = fittedvalues,
                     residuals      = self._y - fittedvalues,
                     method         = method,
                     fitintercept   = fitintercept )

        return line

    def slope(self,method='sma',intercept=True,**kwargs):
        '''Compute slope of bivariate line fit
        
        Parameters
        ----------
        method : str
            line fitting method: sma (default), ols, sen, siegel
        intercept : bool
            defines whether non-zero intercept should be fitted
        **kwargs 
            passed to `fitline`

        Returns
        -------
        slope : float
            slope of the fitted line
        '''
        return self.fitline(method,intercept,**kwargs)['slope']

    def intercept(self,method='sma',intercept=True,**kwargs):
        '''Compute intercept of bivariate line fit
        
        Parameters
        ----------
        method : str
            line fitting method: sma (default), ols, sen, siegel
        intercept : bool
            defines whether non-zero intercept should be fitted
        **kwargs 
            passed to `fitline`

        Returns
        -------
        intercept : float
            value of y intercept
        '''
        return self.fitline(method,intercept,**kwargs)['intercept']

    def _expand_variables(self,variables):
        '''Expand special strings into a list of variables
        
        Parameter
        ---------
        variables : list or str, default='common'
            Special strings ("all","common") will be expanded to a list of variables
            list arguments will not be modified

        Returns
        -------
        list 
            variable names

        Raises
        ------
        ValueError
            if variables is not a list, None, "all", or "common"
        '''
        if variables is None:
            variables='common'
        if variables=='all':
            variables=['MD','MAD','RMD','RMAD','MRD','SMD','SMAD',
                       'MLR','MALR',
                       'MedD','MedAD','RMedD','RMedAD','MedRD',
                       'MedLR','MedALR',
                       'NMBF','NMAEF','RMSD','cov',
                       'R','R2','spearmanr','slope','intercept',
                       'fitline','n']
        elif variables=='common':
            variables=['MD','MAD','RMD','RMAD','MRD','R2','slope','n']
        if not isinstance(variables,list):
            raise ValueError(
                'variables must be a list, None, or one of these strings: "all","common"')

        return variables

    def summary_dict(self, variables=None, fitline_kw=None, floatformat_fiteqn='{:.3f}'):
        '''Summarize bivariate statistics into a dict

        Parameters
        ----------
        variables : list or str, default='common'
            names of attribute variables to include in summary
            names are case insensitive            
            The following strings are also accepted in place of a list 
                "all" (displays all variables)
                "common" (displays all measures of mean difference)
        fitline_kw : dict, default=None
            keywords passed to `fitline`
            default (None) uses method='sma' with an intercept
        floatformat_fiteqn : str, default='{:.3f}'
            format specifier for slope and intercept (a,b) in y = a x + b
        
        Returns
        -------
        summary : dict
            names and values of variables
        '''

        # List of variables
        variables = self._expand_variables(variables)

        if fitline_kw is None:
            fitline_kw = {'method':'sma',
                          'intercept':True}

        # Construct the dict
        summary = {}
        for v in variables:
            vlower = v.lower()
            if vlower in ['slope','intercept']:
                # These variables are object methods
                func = getattr(self,vlower)
                value = func(**fitline_kw)
            elif vlower == 'fitline':
                # The line equation supplies its own label in place of 'fitline'
                line = self.fitline(**fitline_kw)
                v,value = bivariate_line_equation(line,floatformat_fiteqn,ystring='separate')
            else:
                # Retrieve values; attributes are stored under lower case names
                value = getattr(self,vlower)

            summary[v] = value

        return summary

    def summary(self, variables=None, fitline_kw=None,
                intformat='{:d}', floatformat='{:.4f}', floatformat_fiteqn=None,
                stringlength=None ):
        '''Summarize bivariate statistics

        Parameters
        ----------
        variables : list or str, default='common'
            names of attribute variables to include in summary
            names are case insensitive            
            The following strings are also accepted in place of a list 
                "all" (displays all variables)
                "common" (displays all measures of mean difference)
        fitline_kw : dict, default=None
            keywords passed to `fitline`
        intformat : str, default='{:d}'
            format specifier for integer values
        floatformat : str, default='{:.4f}'
            format specifier for floating point values
        floatformat_fiteqn : str, default=floatformat
            format specifier for slope and intercept (a,b) in y = a x + b
        stringlength : int, default=None
            length of the variables on output
            default (None) is to use the length of the longest variable name
        
        Returns
        -------
        summary : str
            names and values of variables, one "name = value" pair per line
        '''
        # List of variables
        variables = self._expand_variables(variables)

        if floatformat_fiteqn is None:
            floatformat_fiteqn = floatformat
        if stringlength is None:
            # default=0 avoids an error when the variable list is empty
            stringlength = max( (len(v) for v in variables), default=0 )
        stringformat = '{:'+str(stringlength)+'s}'

        # Get a dict containing the needed variables
        summarydict = self.summary_dict( variables, fitline_kw, floatformat_fiteqn )

        # One line per variable, with names padded to a common width
        rowformat = stringformat+' = {:s}\n'
        summarytext = ''.join(
            rowformat.format(k,_number2str(v,intformat,floatformat))
            for k,v in summarydict.items() )

        return summarytext

    def summary_fig_inset(self, ax, variables=None, fitline_kw=None,
                          intformat='{:d}', floatformat='{:.3f}', floatformat_fiteqn=None,
                          loc=None, loc_units='axes',
                          **kwargs):
        '''Display bivariate statistics as a table inset on a plot axis

        Parameters
        ----------
        ax : matplotlib.axes.Axes
            axis where the table will be displayed
        variables : list or str, default='common'
            names of attribute variables to include in summary
            names are case insensitive            
            The following strings are also accepted in place of a list 
                "all" (displays all variables)
                "common" (displays all measures of mean difference)
        fitline_kw : dict, default=None
            keywords passed to `fitline`
        intformat : str, default='{:d}'
            format specifier for integer values
        floatformat : str, default='{:.3f}'
            format specifier for floating point values
        floatformat_fiteqn : str, default=floatformat
            format specifier for slope and intercept (a,b) in y = a x + b
        loc : tuple (x0,y0), default=(0.8, 0.05)
            location on the axis where the table will be drawn
            can be in data units or axes units [0-1]
        loc_units : {'axes' (default), 'data'}
            specifies whether loc has 'data' units or 'axes' units [0-1]
        **kwargs
            passed to `ax.text` for both columns of text
                    
        Returns
        -------
        [text1, text2] : list of matplotlib text objects
            Artists for the two text boxes

        Raises
        ------
        ValueError
            if loc_units is not 'data', 'axes', or 'axis'
        '''
        # List of variables
        variables = self._expand_variables(variables)

        if floatformat_fiteqn is None:
            floatformat_fiteqn = floatformat

        # Default location in lower right corner
        if loc is None:
            loc = (0.8,0.05)

        # Coordinates for loc
        if loc_units.lower()=='data':
            coord=ax.transData
        elif loc_units.lower() in ['axes','axis']:
            coord=ax.transAxes
        else:
            raise ValueError('Display units should be "Data" or "Axes"')

        # Get a dict containing the needed variables
        summarydict = self.summary_dict( variables, fitline_kw, floatformat_fiteqn )

        # Column of label text
        label_text = '\n'.join([_texify_name(key)
                                for key in summarydict])
        # Column of value text
        value_text = '\n'.join([_number2str(v,intformat,floatformat)
                                for v in summarydict.values()])

        # Check if horizontal alignment keyword is used
        # 'horizontalalignment' takes precedence over its short form 'ha'
        ha = kwargs.get('horizontalalignment', kwargs.get('ha',''))

        # For right alignment, align on values first
        # Otherwise, align on labels
        if ha=='right':
            first_text = value_text
            second_text = label_text
            sign = -1
        else:
            first_text = label_text
            second_text = value_text
            sign = +1

        # Add first column of text
        t1=ax.text(loc[0],loc[1],
                first_text,
                transform=coord,
                **kwargs
                )

        # Get width of first text column, converted into the units of loc
        bbox = t1.get_window_extent().transformed(coord.inverted())
        width = bbox.x1-bbox.x0

        # Add second column of text, offset by the width of the first
        t2 = ax.text(loc[0]+width*sign,loc[1],
                     second_text,
                     transform=coord,
                     **kwargs
                     )

        return [t1,t2]
class BivariateStatistics:
class BivariateStatistics:
    '''A suite of common statistics to quantify bivariate relationships

    The `summary` method provides a formatted summary of these statistics

    Attributes
    ----------
    count, n : int
        number of data value pairs used (after dropping NaN, if dropna=True)
    xmean, ymean : float
        mean of x and y variables
    xmedian, ymedian : float
        median of x and y variables
    xstd, ystd : float
        standard deviation of x and y variables
    mean_difference, md : float
        ymean - xmean
    std_difference, stdd : float
        std( y - x )
    mean_absolute_difference, mad : float
        mean( |y-x| )
    relative_mean_difference, rmd : float
        md / xmean
    relative_mean_absolute_difference, rmad : float
        mad / xmean
    standardized_mean_difference, smd : float
        md / xstd
    standardized_mean_absolute_difference, smad : float
        mad / xstd
    mean_relative_difference, mrd : float
        mean(y/x) - 1
    mean_log10_ratio, mlr : float
        mean( log10(y/x) )
    std_log10_ratio, stdlr : float
        std( log10(y/x) )
    mean_absolute_log10_ratio, malr : float
        mean( abs( log10(y/x) ) )
    median_difference, medd : float
        median(y-x)
    median_absolute_difference, medad : float
        median(|y-x|)
    relative_median_difference, rmedd : float
        median(y-x) / xmedian
    relative_median_absolute_difference, rmedad : float
        median(|y-x|) / xmedian
    median_relative_difference, medianrd, medrd : float
        median(y/x)-1
    median_log10_ratio, medlr : float
        median( log10(y/x) )
    median_absolute_log10_ratio, medalr : float
        median( abs( log10(y/x) ) )
    normalized_mean_bias_factor, nmbf : float
        see `nmbf` 
    normalized_mean_absolute_error_factor, nmaef : float
        see `nmaef`
    root_mean_square_difference, rmsd : float
        $\\sqrt{ \\langle (y - x)^2 \\rangle }$
    root_mean_square_log10_ratio, rmslr : float
        $\\sqrt{ \\langle \\log_{10}(y/x)^2 \\rangle }$
    covariance, cov : float
        cov(x,y)
    correlation_pearson, correlation, pearsonr, R, r : float
        Pearson linear correlation coefficient 
    correlation_spearman, spearmanr : float
        Spearman, non-parametric rank correlation coefficient
    R2, r2 : float
        Linear coefficient of determination, $R^2$
    '''

    def __init__(self,x,y,w=None,dropna=False,data=None):
        '''Compute suite of bivariate statistics during initialization

        Statistic values are saved in attributes.
        CAUTION: Weights w are ignored except in SMA fit

        Parameters
        ----------
        x : ndarray or str
            independent variable values
        y : ndarray or str
            dependent variable values, same size as x
        w : ndarray or str, optional
            weights for points (x,y), same size as x and y
        dropna : bool, optional (default=False)
            drops NaN values from x, y, and w
        data : dict-like, optional
            if x, y, or w are str, then they should be keys in data

        Raises
        ------
        ValueError
            if x, y, or w is a str and data is None, 
            or if x, y, and w do not all have the same length
        '''

        # Get values from data if needed
        if data is None and any( isinstance(v,str) for v in (x,y,w) ):
            raise ValueError( 'Data argument must be used if x, y, or w is a string')
        if isinstance(x,str):
            x = data[x]
        if isinstance(y,str):
            y = data[y]
        if isinstance(w,str):
            w = data[w]

        # Plain sequences do not support the element-wise arithmetic used below
        if isinstance(x,(list,tuple)):
            x = np.asarray(x)
        if isinstance(y,(list,tuple)):
            y = np.asarray(y)
        if isinstance(w,(list,tuple)):
            w = np.asarray(w)

        # Ensure that x, y, and w have same length
        if len(x) != len(y):
            raise ValueError( 'Arguments x and y must have the same length' )
        if w is None:
            w = np.ones_like(x)
        if len(w) != len(x):
            raise ValueError( 'Argument w (if present) must have the same length as x' )

        # Drop points where any of x, y, w is NaN
        # (the product is NaN whenever any factor is NaN)
        if dropna:
            isna = np.isnan(x*y*w)
            x = x[~isna]
            y = y[~isna]
            w = w[~isna]

        # Differences and ratios used repeatedly
        diff = y - x
        absdiff = np.abs( diff )
        # Ignore divide by zero and 0/0 while dividing;
        # the context manager restores the error settings even if an exception occurs
        with np.errstate(divide='ignore',invalid='ignore'):
            ratio = y/x
            log10ratio = np.log10(ratio)

        # Number of data points
        self.count = self.n = len(x)

        # Means, medians, and standard deviations
        self.xmean = np.mean(x)
        self.ymean = np.mean(y)
        self.xmedian = np.median(x)
        self.ymedian = np.median(y)
        self.xstd   = np.std(x)
        self.ystd   = np.std(y)

        # Save values for use later in line fitting
        self._x = x
        self._y = y
        self._w = w

        # Mean and mean absolute differences
        self.mean_difference            = self.md   = self.ymean - self.xmean
        self.mean_absolute_difference   = self.mad  = np.mean( absdiff )
        self.std_difference             = self.stdd = np.std( diff )

        # Relative and standardized differences
        self.relative_mean_difference           = self.rmd  = self.mean_difference / self.xmean
        self.relative_mean_absolute_difference  = self.rmad = self.mean_absolute_difference / self.xmean
        self.standardized_mean_difference       = self.smd  = self.mean_difference / self.xstd
        self.standardized_mean_absolute_difference  = self.smad = self.mean_absolute_difference / self.xstd

        # Mean and median relative differences
        self.mean_relative_difference   = self.mrd  = np.mean( ratio - 1 )
        self.median_relative_difference = self.medianrd = self.medrd = np.median( ratio - 1 )

        # Median and median absolute differences
        self.median_difference          = self.medd  = np.median( diff )
        self.median_absolute_difference = self.medad = np.median( absdiff )

        # Relative median differences
        self.relative_median_difference          = self.rmedd  = self.median_difference / self.xmedian
        self.relative_median_absolute_difference = self.rmedad = self.median_absolute_difference / self.xmedian

        # Normalized mean bias factor and absolute error factor
        self.normalized_mean_bias_factor            = self.nmbf  = nmbf(x,y)
        self.normalized_mean_absolute_error_factor  = self.nmaef = nmaef(x,y)

        # Mean and mean absolute log ratio
        self.mean_log10_ratio          = self.mlr  = np.mean( log10ratio )
        self.mean_absolute_log10_ratio = self.malr = np.mean( np.abs( log10ratio ) )
        self.std_log10_ratio           = self.stdlr= np.std( log10ratio )

        # Median and median absolute log ratio
        self.median_log10_ratio          = self.medlr  = np.median( log10ratio )
        self.median_absolute_log10_ratio = self.medalr = np.median( np.abs( log10ratio ) )

        # RMS difference
        self.root_mean_square_difference = self.rmsd   = np.sqrt( np.mean( np.power( diff, 2) ) )
        # RMS log ratio
        self.root_mean_square_log10_ratio = self.rmslr = np.sqrt( np.mean( np.power( log10ratio, 2 )))

        # Covariance, correlation
        # 'cov' alias is needed because the "all" variable list requests it by that name
        self.covariance = self.cov = np.cov(x,y)[0][1]
        self.correlation = self.correlation_pearson = self.R = self.r = self.pearsonr = \
            np.corrcoef(x,y)[0][1]
        self.correlation_spearman = self.spearmanr = stats.spearmanr(x,y).statistic
        self.R2 = self.r2 = self.R**2

    def __getitem__(self,key):
        '''Accesses attribute values via object['key']'''
        return getattr(self,key)

    def fitline(self,method='sma',intercept=True,**kwargs):
        '''Compute bivariate line fit

        Parameters
        ----------
        method : str
            line fitting method (case insensitive): 
            sma (default), ols, sen (or theil, theilsen), siegel.
            wls and York are recognized but not implemented yet
        intercept : bool
            defines whether non-zero intercept should be fitted
            Used by sma and ols; sen and siegel always fit an intercept
        **kwargs 
            passed to `acgc.stats.sma` (e.g. robust=True) or `sen`;
            ignored by other methods

        Returns
        -------
        result : dict
            dictionary with keys:
            - slope (float)
                slope of fitted line
            - intercept (float)
                intercept of fitted line
            - fittedvalues (array (N,))
                values on fit line
            - residuals (array (N,))
                residual from fit line
            - method (str)
                method argument, as provided
            - fitintercept (bool)
                True if an intercept was fitted

        Raises
        ------
        NotImplementedError
            for methods wls and York
        ValueError
            for unrecognized methods
        '''

        fitintercept = intercept
        name = method.lower()

        if name=='sma':
            fit = sma(  self._x,
                        self._y,
                        self._w,
                        intercept=fitintercept,
                        **kwargs)
            slope = fit['slope']
            intercept= fit['intercept']

        elif name=='ols':
            if fitintercept:
                design = np.vstack([self._x,np.ones(len(self._x))]).T
                slope, intercept = np.linalg.lstsq( design, self._y, rcond=None )[0]
            else:
                # Only one coefficient is fitted; the line passes through the origin
                design = np.vstack([self._x]).T
                slope = np.linalg.lstsq( design, self._y, rcond=None )[0][0]
                intercept = 0.0

        elif name in ['theil','sen','theilsen']:
            fitintercept = True
            fit = sen( self._x,
                       self._y,
                       **kwargs)
            slope = fit.slope
            intercept = fit.intercept

        elif name=='siegel':
            fitintercept = True
            # scipy signature is siegelslopes(y, x): dependent variable comes first
            siegel = stats.siegelslopes( self._y,
                                         self._x )
            slope = siegel.slope
            intercept = siegel.intercept

        elif name=='wls':
            raise NotImplementedError('WLS regression not implemented yet')

        elif name=='york':
            raise NotImplementedError('York regression not implemented yet')

        else:
            raise ValueError('Undefined method '+method)

        fitted = slope * self._x + intercept
        line = dict( slope          = slope,
                     intercept      = intercept,
                     fittedvalues   = fitted,
                     residuals      = self._y - fitted,
                     method         = method,
                     fitintercept   = fitintercept )

        return line

    def slope(self,method='sma',intercept=True,**kwargs):
        '''Compute slope of bivariate line fit

        Parameters
        ----------
        method : str
            line fitting method, see `fitline` (default: sma)
        intercept : bool
            defines whether non-zero intercept should be fitted
        **kwargs 
            passed to `fitline`

        Returns
        -------
        slope : float
            slope of the fitted line
        '''
        return self.fitline(method,intercept,**kwargs)['slope']

    def intercept(self,method='sma',intercept=True,**kwargs):
        '''Compute intercept of bivariate line fit

        Parameters
        ----------
        method : str
            line fitting method, see `fitline` (default: sma)
        intercept : bool
            defines whether non-zero intercept should be fitted
        **kwargs 
            passed to `fitline`

        Returns
        -------
        intercept : float
            value of y intercept
        '''
        return self.fitline(method,intercept,**kwargs)['intercept']

    def _expand_variables(self,variables):
        '''Expand special strings into a list of variables

        Parameter
        ---------
        variables : list or str, default='common'
            Special strings ("all","common") will be expanded to a list of variables
            list arguments will not be modified
            None is treated as "common"

        Returns
        -------
        list 
            variable names

        Raises
        ------
        ValueError
            if variables is not a list, None, "all", or "common"
        '''
        if variables is None:
            variables='common'
        if variables=='all':
            variables=['MD','MAD','RMD','RMAD','MRD','SMD','SMAD',
                       'MLR','MALR',
                       'MedD','MedAD','RMedD','RMedAD','MedRD',
                       'MedLR','MedALR',
                       'NMBF','NMAEF','RMSD','cov',
                       'R','R2','spearmanr','slope','intercept',
                       'fitline','n']
        elif variables=='common':
            variables=['MD','MAD','RMD','RMAD','MRD','R2','slope','n']
        if not isinstance(variables,list):
            raise ValueError(
                'variables must be a list, None, or one of these strings: "all","common"')

        return variables

    def summary_dict(self, variables=None, fitline_kw=None, floatformat_fiteqn='{:.3f}'):
        '''Summarize bivariate statistics into a dict

        Parameters
        ----------
        variables : list or str, default='common'
            names of attribute variables to include in summary
            names are case insensitive            
            The following strings are also accepted in place of a list 
                "all" (displays all variables)
                "common" (displays a short list of commonly used variables)
        fitline_kw : dict, default=None
            keywords passed to `fitline`
            None uses method='sma' and intercept=True
        floatformat_fiteqn : str, default='{:.3f}'
            format specifier for slope and intercept (a,b) in y = a x + b

        Returns
        -------
        summary : dict
            names and values of variables
        '''

        # List of variables
        variables = self._expand_variables(variables)

        if fitline_kw is None:
            fitline_kw = {'method':'sma',
                          'intercept':True}

        # Construct the dict
        summary = {}
        for v in variables:
            name = v.lower()
            if name in ['slope','intercept']:
                # These variables are object methods
                func = getattr(self,name)
                value = func(**fitline_kw)
            elif name == 'fitline':
                # Key and value both come from the formatted line equation
                line = self.fitline(**fitline_kw)
                v,value = bivariate_line_equation(line,floatformat_fiteqn,ystring='separate')
            else:
                # Statistics are stored under lower-case attribute names
                value = getattr(self,name)

            summary[v] = value

        return summary

    def summary(self, variables=None, fitline_kw=None,
                intformat='{:d}', floatformat='{:.4f}', floatformat_fiteqn=None,
                stringlength=None ):
        '''Summarize bivariate statistics

        Parameters
        ----------
        variables : list or str, default='common'
            names of attribute variables to include in summary
            names are case insensitive            
            The following strings are also accepted in place of a list 
                "all" (displays all variables)
                "common" (displays a short list of commonly used variables)
        fitline_kw : dict, default=None
            keywords passed to `fitline`
        intformat : str, default='{:d}'
            format specifier for integer values
        floatformat : str, default='{:.4f}'
            format specifier for floating point values
        floatformat_fiteqn : str, default=floatformat
            format specifier for slope and intercept (a,b) in y = a x + b
        stringlength : int, default=None
            length of the variables on output
            default (None) is to use the length of the longest variable name

        Returns
        -------
        summary : str
            names and values of variables, one "name = value" pair per line
        '''
        # List of variables
        variables = self._expand_variables(variables)

        if floatformat_fiteqn is None:
            floatformat_fiteqn = floatformat
        if stringlength is None:
            # default=0 keeps an empty variable list from raising
            stringlength = max( (len(v) for v in variables), default=0 )
        stringformat = '{:'+str(stringlength)+'s}'

        # Get a dict containing the needed variables
        summarydict = self.summary_dict( variables, fitline_kw, floatformat_fiteqn )

        # One line of text per variable
        lineformat = stringformat+' = {:s}\n'
        return ''.join( lineformat.format( k, _number2str(v,intformat,floatformat) )
                        for k,v in summarydict.items() )

    def summary_fig_inset(self, ax, variables=None, fitline_kw=None,
                          intformat='{:d}', floatformat='{:.3f}', floatformat_fiteqn=None,
                          loc=None, loc_units='axes',
                          **kwargs):
        '''Display bivariate statistics as a table inset on a plot axis

        Parameters
        ----------
        ax : matplotlib.Figure.Axis 
            axis where the table will be displayed
        variables : list or str, default='common'
            names of attribute variables to include in summary
            names are case insensitive            
            The following strings are also accepted in place of a list 
                "all" (displays all variables)
                "common" (displays a short list of commonly used variables)
        fitline_kw : dict, default=None
            keywords passed to `fitline`
        intformat : str, default='{:d}'
            format specifier for integer values
        floatformat : str, default='{:.3f}'
            format specifier for floating point values
        floatformat_fiteqn : str, default=floatformat
            format specifier for slope and intercept (a,b) in y = a x + b
        loc : tuple (x0,y0), default=(0.8, 0.05)
            location on the axis where the table will be drawn
            can be in data units or axes units [0-1]
        loc_units : {'axes' (default), 'data'}
            specifies whether loc has 'data' units or 'axes' units [0-1]
        **kwargs
            passed to `ax.text` for both text columns
            ha or horizontalalignment = 'right' places the value column at loc
            with the label column to its left

        Returns
        -------
        text1, text2 : matplotlib text object
            Artist for the two text boxes        

        Raises
        ------
        ValueError
            if loc_units is not 'data', 'axes', or 'axis' (case insensitive)
        '''
        # List of variables
        variables = self._expand_variables(variables)

        if floatformat_fiteqn is None:
            floatformat_fiteqn = floatformat

        # Default location in lower right corner
        if loc is None:
            loc = (0.8,0.05)

        # Coordinates for loc
        units = loc_units.lower()
        if units=='data':
            coord=ax.transData
        elif units in ['axes','axis']:
            coord=ax.transAxes
        else:
            raise ValueError('Display units should be "Data" or "Axes"')

        # Get a dict containing the needed variables
        summarydict = self.summary_dict( variables, fitline_kw, floatformat_fiteqn )

        # Column of label text
        label_text = '\n'.join([_texify_name(key)
                                for key in summarydict])
        # Column of value text
        value_text = '\n'.join([_number2str(v,intformat,floatformat)
                                for v in summarydict.values()])

        # Check if horizontal alignment keyword is used
        # The long keyword takes precedence when both are given
        ha = kwargs.get('horizontalalignment', kwargs.get('ha',''))

        # For right alignment, align on values first
        # Otherwise, align on labels
        if ha=='right':
            first_text = value_text
            second_text = label_text
            sign = -1
        else:
            first_text = label_text
            second_text = value_text
            sign = +1

        # Add first column of text
        t1 = ax.text(loc[0],loc[1],
                     first_text,
                     transform=coord,
                     **kwargs
                     )

        # Get width of first text column, in the same units as loc
        bbox = t1.get_window_extent().transformed(coord.inverted())
        width = bbox.x1-bbox.x0

        # Add second column of text, offset by the width of the first column
        t2 = ax.text(loc[0]+width*sign,loc[1],
                     second_text,
                     transform=coord,
                     **kwargs
                     )

        return [t1,t2]

A suite of common statistics to quantify bivariate relationships

The 'summary' method provides a formatted summary of these statistics

Attributes
  • count, n (int): number of valid (not NaN) data value pairs
  • xmean, ymean (float): mean of x and y variables
  • xmedian, ymedian (float): median of x and y variables
  • xstd, ystd (float): standard deviation of x and y variables
  • mean_difference, md (float): ymean - xmean
  • std_difference, stdd (float): std( y - x )
  • mean_absolute_difference, mad (float): mean( |y-x| )
  • relative_mean_difference, rmd (float): md / xmean
  • relative_mean_absolute_difference, rmad (float): mad / xmean
  • standardized_mean_difference, smd (float): md / xstd
  • standardized_mean_absolute_difference, smad (float): mad / xstd
  • mean_relative_difference, mrd (float): mean(y/x) - 1
  • mean_log10_ratio, mlr (float): mean( log10(y/x) )
  • std_log10_ratio, stdlr (float): std( log10(y/x) )
  • mean_absolute_log10_ratio, malr (float): mean( abs( log10(y/x) ) )
  • median_difference, medd (float): median(y-x)
  • median_absolute_difference, medad (float): median(|y-x|)
  • relative_median_difference, rmedd (float): median(y-x) / xmedian
  • relative_median_absolute_difference, rmedad (float): median(|y-x|) / xmedian
  • median_relative_difference, medianrd, medrd (float): median(y/x)-1
  • median_log10_ratio, medlr (float): median( log10(y/x) )
  • median_absolute_log10_ratio, medalr (float): median( abs( log10(y/x) ) )
  • normalized_mean_bias_factor, nmbf (float): see nmbf
  • normalized_mean_absolute_error_factor, nmaef (float): see nmaef
  • root_mean_square_difference, rmsd (float): $\sqrt{ \langle (y - x)^2 \rangle }$
  • root_mean_square_log10_ratio, rmslr (float): $\sqrt{ \langle \log_{10}(y/x)^2 \rangle }$
  • covariance (float): cov(x,y)
  • correlation_pearson, correlation, pearsonr, R, r (float): Pearson linear correlation coefficient
  • correlation_spearman, spearmanr (float): Spearman, non-parametric rank correlation coefficient
  • R2, r2 (float): Linear coefficient of determination, $R^2$
BivariateStatistics(x, y, w=None, dropna=False, data=None)
257    def __init__(self,x,y,w=None,dropna=False,data=None):
258        '''Compute suite of bivariate statistics during initialization
259        
260        Statistic values are saved in attributes.
261        CAUTION: Weights w are ignored except in SMA fit
262
263        Parameters
264        ----------
265        x : ndarray or str
266            independent variable values
267        y : ndarray or str
268            dependent variable values, same size as x
269        w : ndarray or str, optional
270            weights for points (x,y), same size as x and y
271        dropna : bool, optional (default=False)
272            drops NaN values from x, y, and w
273        data : dict-like, optional
274            if x, y, or w are str, then they should be keys in data
275        '''
276
277        # Get values from data if needed
278        if data is None and (isinstance(x,str) or isinstance(y,str) or isinstance(w,str)):
279            raise ValueError( 'Data argument must be used if x, y, or w is a string')
280        if isinstance(x,str):
281            x = data[x]
282        if isinstance(y,str):
283            y = data[y]
284        if isinstance(w,str):
285            w = data[w]
286
287        #Ensure that x and y have same length
288        if len(x) != len(y):
289            raise ValueError( 'Arguments x and y must have the same length' )
290        if w is None:
291            w = np.ones_like(x)
292        if len(w) != len(x):
293            raise ValueError( 'Argument w (if present) must have the same length as x' )
294
295        # Drop NaN values
296        if dropna:
297            isna = np.isnan(x*y*w)
298            x = x[~isna]
299            y = y[~isna]
300            w = w[~isna]
301
302        # Differences and ratios used repeatedly
303        diff = y - x
304        absdiff = np.abs( y - x )
305        # Ignore divide by zero and 0/0 while dividing
306        old_settings = np.seterr(divide='ignore',invalid='ignore')
307        ratio = y/x
308        log10ratio = np.log10(ratio)
309        np.seterr(**old_settings)
310
311        # Number of data points
312        self.count = self.n = len(x)
313
314        # Means, medians, and standard deviations
315        self.xmean = np.mean(x)
316        self.ymean = np.mean(y)
317        self.xmedian = np.median(x)
318        self.ymedian = np.median(y)
319        self.xstd   = np.std(x)
320        self.ystd   = np.std(y)
321
322        # Save values for use later
323        self._x = x
324        self._y = y
325        self._w = w
326
327        # Mean and mean absolute differences
328        self.mean_difference            = self.md   = self.ymean - self.xmean
329        self.mean_absolute_difference   = self.mad  = np.mean( absdiff )
330        self.std_difference             = self.stdd = np.std( diff )
331
332        # Relative and standardized differences
333        self.relative_mean_difference           = self.rmd  = self.mean_difference / self.xmean
334        self.relative_mean_absolute_difference  = self.rmad = self.mean_absolute_difference / self.xmean
335        self.standardized_mean_difference       = self.smd  = self.mean_difference / self.xstd
336        self.standardized_mean_absolute_difference  = self.smad = self.mean_absolute_difference / self.xstd
337
338        # Mean and median relative differences
339        self.mean_relative_difference   = self.mrd  = np.mean( ratio - 1 )
340        self.median_relative_difference = self.medianrd = self.medrd = np.median( ratio - 1 )
341
342        # Median and median absolute differences
343        self.median_difference          = self.medd  = np.median( diff )
344        self.median_absolute_difference = self.medad = np.median( absdiff )
345
346        # Relative median differences
347        self.relative_median_difference          = self.rmedd  = self.median_difference / self.xmedian
348        self.relative_median_absolute_difference = self.rmedad = self.median_absolute_difference / self.xmedian
349
350        self.normalized_mean_bias_factor            = self.nmbf  = nmbf(x,y)
351        self.normalized_mean_absolute_error_factor  = self.nmaef = nmaef(x,y)
352
353        # Mean and mean absolute log ratio
354        self.mean_log10_ratio          = self.mlr  = np.mean( log10ratio )
355        self.mean_absolute_log10_ratio = self.malr = np.mean( np.abs( log10ratio ) )
356        self.std_log10_ratio           = self.stdlr= np.std( log10ratio )
357
358        # Median and median absolute log ratio
359        self.median_log10_ratio          = self.medlr  = np.median( log10ratio )
360        self.median_absolute_log10_ratio = self.medalr = np.median( np.abs( log10ratio ) )
361
362        # RMS difference
363        self.root_mean_square_difference = self.rmsd   = np.sqrt( np.mean( np.power( diff, 2) ) )
364        # RMS log ratio
365        self.root_mean_square_log10_ratio = self.rmslr = np.sqrt( np.mean( np.power( log10ratio, 2 )))
366
367        # Covariance, correlation
368        self.covariance = np.cov(x,y)[0][1]
369        self.correlation = self.correlation_pearson = self.R = self.r = self.pearsonr = \
370            np.corrcoef(x,y)[0][1]
371        self.correlation_spearman = self.spearmanr = stats.spearmanr(x,y).statistic
372        self.R2 = self.r2 = self.R**2

Compute suite of bivariate statistics during initialization

Statistic values are saved in attributes. CAUTION: Weights w are ignored except in SMA fit

Parameters
  • x (ndarray or str): independent variable values
  • y (ndarray or str): dependent variable values, same size as x
  • w (ndarray or str, optional): weights for points (x,y), same size as x and y
  • dropna (bool, optional (default=False)): drops NaN values from x, y, and w
  • data (dict-like, optional): if x, y, or w are str, then they should be keys in data
xmean
ymean
xmedian
ymedian
xstd
ystd
covariance
def fitline(self, method='sma', intercept=True, **kwargs):
378    def fitline(self,method='sma',intercept=True,**kwargs):
379        '''Compute bivariate line fit
380        
381        Parameters
382        ----------
383        method : str
384            line fitting method: sma (default), ols, wls, York, sen, siegel
385        intercept : bool
386            defines whether non-zero intercept should be fitted
387        **kwargs 
388            passed to `acgc.stats.sma` (e.g. robust=True)
389
390        Returns
391        -------
392        result : dict
393            dictionary with keys:
394            - slope (float)
395                slope of fitted line
396            - intercept (float)
397                intercept of fitted line
398            - fittedvalues (array (N,))
399                values on fit line
400            - residuals (array (N,))
401                residual from fit line
402        '''
403
404        fitintercept = intercept
405
406        if method.lower()=='sma':
407            fit = sma(  self._x,
408                        self._y,
409                        self._w,
410                        intercept=fitintercept,
411                        **kwargs)
412            slope = fit['slope']
413            intercept= fit['intercept']
414
415        elif method.lower()=='ols':
416            if fitintercept:
417                ols = np.linalg.lstsq( np.vstack([self._x,np.ones(len(self._x))]).T,
418                                      self._y, rcond=None )
419            else:
420                ols = np.linalg.lstsq( np.vstack([self._x]).T, self._y, rcond=None )
421            slope = ols[0][0]
422            intercept = ols[0][1]
423
424        elif method.lower() in ['theil','sen','theilsen']:
425            fitintercept = True
426            fit = sen( self._x,
427                       self._y,
428                       **kwargs)
429            slope = fit.slope
430            intercept = fit.intercept
431
432        elif method.lower()=='siegel':
433            fitintercept = True
434            siegel = stats.siegelslopes( self._x,
435                                         self._y )
436            slope = siegel.slope
437            intercept = siegel.intercept
438
439        elif method.lower()=='wls':
440            raise NotImplementedError('WLS regression not implemented yet')
441
442        elif method.lower()=='york':
443            raise NotImplementedError('York regression not implemented yet')
444
445        else:
446            raise ValueError('Undefined method '+method)
447
448        line = dict( slope          = slope,
449                     intercept      = intercept,
450                     fittedvalues   = slope * self._x + intercept,
451                     residuals      = self._y - ( slope * self._x + intercept ),
452                     method         = method,
453                     fitintercept   = fitintercept )
454
455        return line

Compute bivariate line fit

Parameters
  • method (str): line fitting method: sma (default), ols, wls, York, sen, siegel
  • intercept (bool): defines whether non-zero intercept should be fitted
  • **kwargs: passed to acgc.stats.sma (e.g. robust=True)
Returns
  • result (dict): dictionary with keys:
    • slope (float) slope of fitted line
    • intercept (float) intercept of fitted line
    • fittedvalues (array (N,)) values on fit line
    • residuals (array (N,)) residual from fit line
def slope(self, method='sma', intercept=True, **kwargs):
457    def slope(self,method='sma',intercept=True,**kwargs):
458        '''Compute slope of bivariate line fit
459        
460        Parameters
461        ----------
462        method : str
463            line fitting method: sma (default), ols, wls
464        intercept : bool
465            defines whether non-zero intercept should be fitted
466        **kwargs 
467            passed to `fitline`
468
469        Returns
470        -------
471        slope : float
472            value of y intercept
473        '''
474        return self.fitline(method,intercept,**kwargs)['slope']

Compute slope of bivariate line fit

Parameters
  • method (str): line fitting method: sma (default), ols, wls
  • intercept (bool): defines whether non-zero intercept should be fitted
  • **kwargs: passed to fitline
Returns
  • slope (float): slope of the fitted line
def intercept(self, method='sma', intercept=True, **kwargs):
476    def intercept(self,method='sma',intercept=True,**kwargs):
477        '''Compute intercept of bivariate line fit
478        
479        Parameters
480        ----------
481        method : str
482            line fitting method: sma (default) or ols
483        intercept : bool
484            defines whether non-zero intercept should be fitted
485        **kwargs 
486            passed to `fitline`
487
488        Returns
489        -------
490        intercept : float
491            value of y intercept
492        '''
493        return self.fitline(method,intercept,**kwargs)['intercept']

Compute intercept of bivariate line fit

Parameters
  • method (str): line fitting method: sma (default) or ols
  • intercept (bool): defines whether non-zero intercept should be fitted
  • **kwargs: passed to fitline
Returns
  • intercept (float): value of y intercept
def summary_dict(self, variables=None, fitline_kw=None, floatformat_fiteqn='{:.3f}'):
527    def summary_dict(self, variables=None, fitline_kw=None, floatformat_fiteqn='{:.3f}'):
528        '''Summarize bivariate statistics into a dict
529
530        Parameters
531        ----------
532        vars : list or str, default='common'
533            names of attribute variables to include in summary
534            names are case insensitive            
535            The following strings are also accepted in place of a list 
536                "all" (displays all variables)
537                "common" (displays all measures of mean difference)
538        fitline_kw : dict, default=None
539            keywords passed to `fitline`
540        floatformat_fiteqn : str, default=floatformat
541            format specifier for slope and intercept (a,b) in y = a x + b
542        
543        Returns
544        -------
545        summary : dict
546            names and values of variables
547        '''
548
549        # List of variables
550        variables = self._expand_variables(variables)
551
552        if fitline_kw is None:
553            fitline_kw = {'method':'sma',
554                          'intercept':True}
555
556        # Construct the dict
557        summary = {}
558        for v in variables:
559            if v in ['slope','intercept']:
560                # These variables are object methods
561                func = getattr(self,v)
562                value = func(**fitline_kw)
563            elif v == 'fitline':
564                line = self.fitline(**fitline_kw)
565                v,value = bivariate_line_equation(line,floatformat_fiteqn,ystring='separate')
566            else:
567                # Retrieve values
568                value = getattr(self,v.lower())
569
570            # summary += (stringformat+'='+floatformat+'\n').format(v,value)
571            summary[v] = value
572
573        return summary

Summarize bivariate statistics into a dict

Parameters
  • variables (list or str, default='common'): names of attribute variables to include in summary; names are case insensitive
    The following strings are also accepted in place of a list "all" (displays all variables) "common" (displays all measures of mean difference)
  • fitline_kw (dict, default=None): keywords passed to fitline
  • floatformat_fiteqn (str, default=floatformat): format specifier for slope and intercept (a,b) in y = a x + b
Returns
  • summary (dict): names and values of variables
def summary( self, variables=None, fitline_kw=None, intformat='{:d}', floatformat='{:.4f}', floatformat_fiteqn=None, stringlength=None):
575    def summary(self, variables=None, fitline_kw=None,
576                intformat='{:d}', floatformat='{:.4f}', floatformat_fiteqn=None,
577                stringlength=None ):
578        '''Summarize bivariate statistics
579
580        Parameters
581        ----------
582        vars : list or str, default='common'
583            names of attribute variables to include in summary
584            names are case insensitive            
585            The following strings are also accepted in place of a list 
586                "all" (displays all variables)
587                "common" (displays all measures of mean difference)
588        fitline_kw : dict, default=None
589            keywords passed to `fitline`
590        intformat : str, default='{:d}'
591            format specifier for integer values
592        floatformat : str, default='{:.4f}'
593            format specifier for floating point values
594        floatformat_fiteqn : str, default=floatformat
595            format specifier for slope and intercept (a,b) in y = a x + b
596        stringlength : int, default=None
597            length of the variables on output
598            default (None) is to use the length of the longest variable name
599        
600        Returns
601        -------
602        summary : str
603            names and values of variables
604        '''
605        # List of variables
606        variables = self._expand_variables(variables)
607
608        if floatformat_fiteqn is None:
609            floatformat_fiteqn = floatformat
610        if stringlength is None:
611            stringlength = np.max([len(v) for v in variables])
612        stringformat = '{:'+str(stringlength)+'s}'
613
614        # Get a dict containing the needed variables
615        summarydict = self.summary_dict( variables, fitline_kw, floatformat_fiteqn )
616
617        # Extract length of the float numbers from floatformat
618        # import re
619        # floatlength = np.floor( float( re.findall("[-+]?(?:\d*\.*\d+)",
620        #       floatformat )[0] ) ).astype(int)
621
622        # summary = (stringformat+'{:>10s}').format('Variable','Value')
623        summarytext = ''
624        for k,v in summarydict.items():
625            vstr = _number2str(v,intformat,floatformat)
626            summarytext += (stringformat+' = {:s}\n').format(k,vstr)
627
628        return summarytext

Summarize bivariate statistics

Parameters
  • variables (list or str, default='common'): names of attribute variables to include in summary; names are case insensitive
    The following strings are also accepted in place of a list "all" (displays all variables) "common" (displays all measures of mean difference)
  • fitline_kw (dict, default=None): keywords passed to fitline
  • intformat (str, default='{:d}'): format specifier for integer values
  • floatformat (str, default='{:.4f}'): format specifier for floating point values
  • floatformat_fiteqn (str, default=floatformat): format specifier for slope and intercept (a,b) in y = a x + b
  • stringlength (int, default=None): length of the variables on output default (None) is to use the length of the longest variable name
Returns
  • summary (str): names and values of variables
def summary_fig_inset( self, ax, variables=None, fitline_kw=None, intformat='{:d}', floatformat='{:.3f}', floatformat_fiteqn=None, loc=None, loc_units='axes', **kwargs):
630    def summary_fig_inset(self, ax, variables=None, fitline_kw=None,
631                          intformat='{:d}', floatformat='{:.3f}', floatformat_fiteqn=None,
632                          loc=None, loc_units='axes',
633                          **kwargs):
634        '''Display bivariate statistics as a table inset on a plot axis
635
636        Parameters
637        ----------
638        ax : matplotlib.Figure.Axis 
639            axis where the table will be displayed
640        variables : list or str, default='common'
641            names of attribute variables to include in summary
642            names are case insensitive            
643            The following strings are also accepted in place of a list 
644                "all" (displays all variables)
645                "common" (displays all measures of mean difference)
646        fitline_kw : dict, default=None
647            keywords passed to `fitline`
648        intformat : str, default='{:d}'
649            format specifier for integer values
650        floatformat : str, default='{:.3f}'
651            format specifier for floating point values
652        floatformat_fiteqn : str, default=floatformat
653            format specifier for slope and intercept (a,b) in y = a x + b
654        loc : tuple (x0,y0), default=(0.85, 0.05)
655            location on the axis where the table will be drawn
656            can be in data units or axes units [0-1]
657        loc_units : {'axes' (default), 'data'}
658            specifies whether loc has 'data' units or 'axes' units [0-1]
659                    
660        Returns
661        -------
662        text1, text2 : matplotlib text object
663            Artist for the two text boxes        
664        '''
665        # List of variables
666        variables = self._expand_variables(variables)
667
668        if floatformat_fiteqn is None:
669            floatformat_fiteqn = floatformat
670
671        # Default location in lower right corner
672        if loc is None:
673            loc = (0.8,0.05)
674
675        # Coordinates for loc
676        if loc_units.lower()=='data':
677            coord=ax.transData
678        elif loc_units.lower() in ['axes','axis']:
679            coord=ax.transAxes
680        else:
681            raise ValueError('Display units should be "Data" or "Axes"')
682
683        # Get a dict containing the needed variables
684        summarydict = self.summary_dict( variables, fitline_kw, floatformat_fiteqn )
685
686        # Column of label text
687        label_text = '\n'.join([_texify_name(key)
688                                for key in summarydict])
689        # Column of value text
690        value_text = '\n'.join([_number2str(v,intformat,floatformat)
691                                for v in summarydict.values()])
692
693        # Check if horizontal alignment keyword is used
694        ha=''
695        try:
696            ha = kwargs['ha']
697        except KeyError:
698            pass
699        try:
700            ha = kwargs['horizontalalignment']
701        except KeyError:
702            pass
703
704        # For right alignment, align on values first
705        # Otherwise, align on labels
706        if ha=='right':
707            first_text = value_text
708            second_text = label_text
709            sign = -1
710        else:
711            first_text = label_text
712            second_text = value_text
713            sign = +1
714
715        # Add first column of text
716        t1=ax.text(loc[0],loc[1],
717                first_text,
718                transform=coord,
719                **kwargs
720                )
721
722        # Get width of first text column
723        bbox = t1.get_window_extent().transformed(coord.inverted())
724        width = bbox.x1-bbox.x0
725
726        # Add second column of text
727        t2 = ax.text(loc[0]+width*sign,loc[1],
728                     second_text,
729                     transform=coord,
730                     **kwargs
731                     )
732
733        ##################################
734        # Early version of this function using matplotlib.table.table()
735
736        # if isinstance(loc,(tuple,list)):
737        #     # Create an inset axis to contain the table
738        #     tableaxis = ax.inset_axes(loc)
739        #     table_width=1
740        # else:
741        #     tableaxis = ax
742
743        # # Display the table on the axis
744        # return mtable.table(
745        #     tableaxis,
746        #     cellText=[[floatformat.format(value)] for value in summarydict.values()],
747        #     rowLabels=[texify_name(key) for key in summarydict],
748        #     colWidths=[table_width/2]*2,
749        #     edges=edges,
750        #     loc=loc, bbox=bbox
751        #     )
752
753        return [t1,t2]

Display bivariate statistics as a table inset on a plot axis

Parameters
  • ax (matplotlib.Figure.Axis): axis where the table will be displayed
  • variables (list or str, default='common'): names of attribute variables to include in summary names are case insensitive
    The following strings are also accepted in place of a list "all" (displays all variables) "common" (displays all measures of mean difference)
  • fitline_kw (dict, default=None): keywords passed to fitline
  • intformat (str, default='{:d}'): format specifier for integer values
  • floatformat (str, default='{:.3f}'): format specifier for floating point values
  • floatformat_fiteqn (str, default=floatformat): format specifier for slope and intercept (a,b) in y = a x + b
  • loc (tuple (x0,y0), default=(0.8, 0.05)): location on the axis where the table will be drawn; can be in data units or axes units [0-1]
  • loc_units ({'axes' (default), 'data'}): specifies whether loc has 'data' units or 'axes' units [0-1]
Returns
  • text1, text2 (matplotlib text object): Artist for the two text boxes
def nmb(x0, x1):
def nmb( x0, x1 ):
    '''Compute Normalized Mean Bias (NMB)

    NMB = ( mean(x1) - mean(x0) ) / mean(x0)

    Evaluated in the algebraically equivalent form mean(x1) / mean(x0) - 1.

    Parameters
    ----------
    x0 : array_like
        reference values
    x1 : array_like
        experiment values, same length as x0

    Returns
    -------
    float
        normalized mean bias
    '''

    # Both samples must have the same number of elements
    assert (len(x0) == len(x1)), \
        "Parameters x0 and x1 must have the same length"

    reference_mean = np.mean(x0)
    experiment_mean = np.mean(x1)

    return experiment_mean / reference_mean - 1

Compute Normalized Mean Bias (NMB)

NMB = ( mean(x1) - mean(x0) ) / mean(x0)

Parameters
  • x0 (array_like): reference values
  • x1 (array_like): experiment values
def nmae(x0, x1):
def nmae( x0, x1 ):
    '''Compute Normalized Mean Absolute Error (NMAE)

    NMAE = mean(abs(x1 - x0)) / abs(mean(x0))

    Parameters
    ----------
    x0 : array_like
        reference values
    x1 : array_like
        experiment values

    Returns
    -------
    float
        normalized mean absolute error
    '''

    # Convert to arrays so that plain sequences (e.g. lists) can be subtracted
    x0 = np.asarray(x0)
    x1 = np.asarray(x1)

    # Mean values
    x0_mean = np.mean(x0)

    # Mean absolute difference
    abs_diff = np.mean( np.abs(x1 - x0) )

    # Metric value
    return abs_diff / np.abs( x0_mean )

Compute Normalized Mean Absolute Error (NMAE)

NMAE = mean(abs(x1 - x0)) / abs(mean(x0))

Parameters
  • x0 (array_like): reference values
  • x1 (array_like): experiment values
def nmbf(x0, x1):
def nmbf( x0, x1 ):
    '''Compute Normalized Mean Bias Factor (NMBF)

    Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125

    NMBF = mean(x1) / mean(x0) - 1    if mean(x1) >= mean(x0)
    NMBF = 1 - mean(x0) / mean(x1)    otherwise

    Parameters
    ----------
    x0 : array_like
        reference values
    x1 : array_like
        experiment values, same length as x0

    Returns
    -------
    float
        normalized mean bias factor
    '''

    # Both samples must have the same number of elements
    assert (len(x0) == len(x1)), \
        "Parameters x0 and x1 must have the same length"

    reference_mean = np.mean(x0)
    experiment_mean = np.mean(x1)

    # The branch is selected by which mean is larger
    if experiment_mean >= reference_mean:
        return experiment_mean / reference_mean - 1
    return 1 - reference_mean / experiment_mean

Compute Normalized Mean Bias Factor (NMBF)

Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125

Parameters
  • x0 (array_like): reference values
  • x1 (array_like): experiment values
def nmaef(x0, x1):
def nmaef( x0, x1 ):
    '''Compute Normalized Mean Absolute Error Factor (NMAEF)

    Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125

    NMAEF = mean(abs(x1 - x0)) / mean(x0)    if mean(x1) >= mean(x0)
    NMAEF = mean(abs(x1 - x0)) / mean(x1)    otherwise

    Parameters
    ----------
    x0 : array_like
        reference values
    x1 : array_like
        experiment values, same length as x0

    Returns
    -------
    float
        normalized mean absolute error factor
    '''

    # Ensure that arguments have the same length
    assert (len(x0) == len(x1)), \
        "Parameters x0 and x1 must have the same length"

    # Convert to arrays so that plain sequences (e.g. lists) can be subtracted
    x0 = np.asarray(x0)
    x1 = np.asarray(x1)

    # Mean values
    x0_mean = np.mean(x0)
    x1_mean = np.mean(x1)

    # Mean absolute difference
    abs_diff = np.mean( np.abs(x1 - x0) )

    # Metric value: normalize by the smaller of the two means
    if x1_mean >= x0_mean:
        result = abs_diff / x0_mean
    else:
        result = abs_diff / x1_mean

    return result

Compute Normalized Mean Absolute Error Factor (NMAEF)

Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125

Parameters
  • x0 (array_like): reference values
  • x1 (array_like): experiment values