acgc.stats.bivariate
Bivariate statistics
Statistical measures of relationships between two populations
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" Bivariate statistics

Statistical measures of relationships between two populations
"""

import numpy as np
from scipy import stats
from .bivariate_lines import sen, sma, bivariate_line_equation
# import xarray as xr

__all__ = [
    "BivariateStatistics",
    "nmb",
    "nmae",
    "nmbf",
    "nmaef"
]

def nmb( x0, x1 ):
    '''Compute Normalized Mean Bias (NMB)

    NMB = ( mean(x1) - mean(x0) ) / mean(x0)

    Parameters
    ----------
    x0 : array_like
        reference values
    x1 : array_like
        experiment values

    Returns
    -------
    float
        normalized mean bias
    '''

    assert (len(x0) == len(x1)), \
        "Parameters x0 and x1 must have the same length"

    # Mean values
    x0_mean = np.mean(x0)
    x1_mean = np.mean(x1)

    # Metric value; algebraically equal to (x1_mean - x0_mean) / x0_mean
    return x1_mean / x0_mean - 1

def nmae( x0, x1 ):
    '''Compute Normalized Mean Absolute Error (NMAE)

    NMAE = mean(abs(x1 - x0)) / abs(mean(x0))

    Parameters
    ----------
    x0 : array_like
        reference values
    x1 : array_like
        experiment values

    Returns
    -------
    float
        normalized mean absolute error
    '''

    # Mean values
    x0_mean = np.mean(x0)

    # Mean absolute difference
    # np.subtract (rather than x1 - x0) also accepts plain sequences
    abs_diff = np.mean( np.abs( np.subtract(x1, x0) ) )

    # Metric value
    return abs_diff / np.abs( x0_mean )


def nmbf( x0, x1 ):
    '''Compute Normalized Mean Bias Factor (NMBF)

    Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125

    NMBF = mean(x1) / mean(x0) - 1   if mean(x1) >= mean(x0)
    NMBF = 1 - mean(x0) / mean(x1)   otherwise

    Parameters
    ----------
    x0 : array_like
        reference values
    x1 : array_like
        experiment values

    Returns
    -------
    float
        normalized mean bias factor
    '''

    # Ensure that arguments have the same length
    assert (len(x0) == len(x1)), \
        "Parameters x0 and x1 must have the same length"

    # Mean values
    x0_mean = np.mean(x0)
    x1_mean = np.mean(x1)

    # Metric value
    if x1_mean >= x0_mean:
        return x1_mean / x0_mean - 1
    return 1 - x0_mean / x1_mean

def nmaef( x0, x1 ):
    '''Compute Normalized Mean Absolute Error Factor (NMAEF)

    Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125

    NMAEF = mean(abs(x1 - x0)) / mean(x0)   if mean(x1) >= mean(x0)
    NMAEF = mean(abs(x1 - x0)) / mean(x1)   otherwise

    Parameters
    ----------
    x0 : array_like
        reference values
    x1 : array_like
        experiment values

    Returns
    -------
    float
        normalized mean absolute error factor
    '''

    # Ensure that arguments have the same length
    assert (len(x0) == len(x1)), \
        "Parameters x0 and x1 must have the same length"

    # Mean values
    x0_mean = np.mean(x0)
    x1_mean = np.mean(x1)

    # Mean absolute difference
    # np.subtract (rather than x1 - x0) also accepts plain sequences
    abs_diff = np.mean( np.abs( np.subtract(x1, x0) ) )

    # Metric value: normalize by the smaller of the two means
    if x1_mean >= x0_mean:
        return abs_diff / x0_mean
    return abs_diff / x1_mean

def _texify_name(name):
    '''Return a LaTex formatted string for some variables

    Names without a LaTeX form are returned unchanged.

    Parameters
    ----------
    name : str

    Returns
    -------
    pretty_name : str
    '''
    # Names where upper and lower case produce different output
    case_sensitive = {'R2': '$R^2$',
                      'r2': '$r^2$'}
    # Names matched without regard to case
    case_insensitive = {'n':     r'$n$',
                        'y_ols': r'$y_{\rm OLS}$',
                        'y_sma': r'$y_{\rm SMA}$',
                        'y_sen': r'$y_{\rm Sen}$'}

    if name in case_sensitive:
        return case_sensitive[name]
    return case_insensitive.get( name.lower(), name )

def _number2str(value,
                intformat='{:d}',
                floatformat='{:.4f}'):
    '''Format number as string using integer and float format specifiers

    Strings are returned unchanged.

    Parameters
    ----------
    value : numeric, str
        value to be converted
    intformat : str, default='{:d}'
        format specifier for integer types
    floatformat : str, default='{:.4f}'
        format specifier for float types

    Returns
    -------
    str
    '''
    if isinstance(value,str):
        return value
    if isinstance(value,(int,np.integer)):
        return intformat.format(value)
    return floatformat.format(value)

class BivariateStatistics:
    '''A suite of common statistics to quantify bivariate relationships

    Class method 'summary' provides a formatted summary of these statistics

    Attributes
    ----------
    count, n : int
        number of valid (not NaN) data value pairs
    xmean, ymean : float
        mean of x and y variables
    xmedian, ymedian :float
        median of x and y variables
    xstd, ystd : float
        standard deviation of x and y variables
    mean_difference, md : float
        ymean - xmean
    std_difference, stdd : float
        std( y - x )
    mean_absolute_difference, mad : float
        mean( |y-x| )
    relative_mean_difference, rmd : float
        md / xmean
    relative_mean_absolute_difference, rmad :float
        mad / xmean
    standardized_mean_difference, smd : float
        md / xstd
    standardized_mean_absolute_difference, smad : float
        mad /xstd
    mean_relative_difference, mrd : float
        mean(y/x) - 1
    mean_log10_ratio, mlr : float
        mean( log10(y/x) )
    std_log10_ratio, stdlr : float
        std( log10(y/x) )
    mean_absolute_log10_ratio, malr : float
        mean( abs( log10(y/x) ) )
    median_difference, medd : float
        median(y-x)
    median_absolute_difference, medad : float
        median(|y-x|)
    relative_median_difference, rmedd : float
        median(y-x) / xmedian
    relative_median_absolute_difference, rmedad : float
        median(|y-x|) / xmedian
    median_relative_difference, medianrd, medrd : float
        median(y/x)-1
    median_log10_ratio, medlr : float
        median( log10(y/x) )
    median_absolute_log10_ratio, medalr : float
        median( abs( log10(y/x) ) )
    normalized_mean_bias_factor, nmbf : float
        see `nmbf`
    normalized_mean_absolute_error_factor, nmaef : float
        see `nmaef`
    root_mean_square_difference, rmsd : float
        $\\sqrt{ \\langle (y - x)^2 \\rangle }$
    root_mean_square_log10_ratio, rmslr : float
        $\\sqrt{ \\langle \\log_{10}(y/x)^2 \\rangle }$
    covariance, cov : float
        cov(x,y)
    correlation_pearson, correlation, pearsonr, R, r : float
        Pearson linear correlation coefficient
    correlation_spearman, spearmanr : float
        Spearman, non-parametric rank correlation coefficient
    R2, r2 : float
        Linear coefficient of determination, $R^2$
    '''

    def __init__(self,x,y,w=None,dropna=False,data=None):
        '''Compute suite of bivariate statistics during initialization

        Statistic values are saved in attributes.
        CAUTION: Weights w are ignored except in SMA fit

        Parameters
        ----------
        x : ndarray or str
            independent variable values
        y : ndarray or str
            dependent variable values, same size as x
        w : ndarray or str, optional
            weights for points (x,y), same size as x and y
        dropna : bool, optional (default=False)
            drops NaN values from x, y, and w
        data : dict-like, optional
            if x, y, or w are str, then they should be keys in data

        Raises
        ------
        ValueError
            if x, y, or w is a str but data is None,
            or if x, y, and w do not all have the same length
        '''

        # Get values from data if needed
        if data is None and (isinstance(x,str) or isinstance(y,str) or isinstance(w,str)):
            raise ValueError( 'Data argument must be used if x, y, or w is a string')
        if isinstance(x,str):
            x = data[x]
        if isinstance(y,str):
            y = data[y]
        if isinstance(w,str):
            w = data[w]

        #Ensure that x and y have same length
        if len(x) != len(y):
            raise ValueError( 'Arguments x and y must have the same length' )
        if w is None:
            w = np.ones_like(x)
        if len(w) != len(x):
            raise ValueError( 'Argument w (if present) must have the same length as x' )

        # Drop NaN values
        # The product is NaN wherever any one of x, y, w is NaN
        if dropna:
            isna = np.isnan(x*y*w)
            x = x[~isna]
            y = y[~isna]
            w = w[~isna]

        # Differences and ratios used repeatedly
        diff = y - x
        absdiff = np.abs( diff )
        # Ignore divide by zero and 0/0 while dividing
        # The context manager restores the previous settings even if an error occurs
        with np.errstate(divide='ignore',invalid='ignore'):
            ratio = y/x
            log10ratio = np.log10(ratio)

        # Number of data points
        self.count = self.n = len(x)

        # Means, medians, and standard deviations
        self.xmean = np.mean(x)
        self.ymean = np.mean(y)
        self.xmedian = np.median(x)
        self.ymedian = np.median(y)
        self.xstd   = np.std(x)
        self.ystd   = np.std(y)

        # Save values for use later (line fitting)
        self._x = x
        self._y = y
        self._w = w

        # Mean and mean absolute differences
        self.mean_difference            = self.md   = self.ymean - self.xmean
        self.mean_absolute_difference   = self.mad  = np.mean( absdiff )
        self.std_difference             = self.stdd = np.std( diff )

        # Relative and standardized differences
        self.relative_mean_difference               = self.rmd  = self.mean_difference / self.xmean
        self.relative_mean_absolute_difference      = self.rmad = self.mean_absolute_difference / self.xmean
        self.standardized_mean_difference           = self.smd  = self.mean_difference / self.xstd
        self.standardized_mean_absolute_difference  = self.smad = self.mean_absolute_difference / self.xstd

        # Mean and median relative differences
        self.mean_relative_difference   = self.mrd  = np.mean( ratio - 1 )
        self.median_relative_difference = self.medianrd = self.medrd = np.median( ratio - 1 )

        # Median and median absolute differences
        self.median_difference          = self.medd  = np.median( diff )
        self.median_absolute_difference = self.medad = np.median( absdiff )

        # Relative median differences
        self.relative_median_difference          = self.rmedd  = self.median_difference / self.xmedian
        self.relative_median_absolute_difference = self.rmedad = self.median_absolute_difference / self.xmedian

        self.normalized_mean_bias_factor            = self.nmbf  = nmbf(x,y)
        self.normalized_mean_absolute_error_factor  = self.nmaef = nmaef(x,y)

        # Mean and mean absolute log ratio
        self.mean_log10_ratio          = self.mlr  = np.mean( log10ratio )
        self.mean_absolute_log10_ratio = self.malr = np.mean( np.abs( log10ratio ) )
        self.std_log10_ratio           = self.stdlr= np.std( log10ratio )

        # Median and median absolute log ratio
        self.median_log10_ratio          = self.medlr  = np.median( log10ratio )
        self.median_absolute_log10_ratio = self.medalr = np.median( np.abs( log10ratio ) )

        # RMS difference
        self.root_mean_square_difference    = self.rmsd  = np.sqrt( np.mean( np.power( diff, 2) ) )
        # RMS log ratio
        self.root_mean_square_log10_ratio   = self.rmslr = np.sqrt( np.mean( np.power( log10ratio, 2 )))

        # Covariance, correlation
        # The 'cov' alias is required by the "all" variable list in _expand_variables
        self.covariance = self.cov = np.cov(x,y)[0][1]
        self.correlation = self.correlation_pearson = self.R = self.r = self.pearsonr = \
            np.corrcoef(x,y)[0][1]
        self.correlation_spearman = self.spearmanr = stats.spearmanr(x,y).statistic
        self.R2 = self.r2 = self.R**2

    def __getitem__(self,key):
        '''Accesses attribute values via object['key']'''
        return getattr(self,key)

    def fitline(self,method='sma',intercept=True,**kwargs):
        '''Compute bivariate line fit

        Parameters
        ----------
        method : str
            line fitting method: sma (default), ols, wls, York, sen, siegel
            (wls and York are not implemented yet)
        intercept : bool
            defines whether non-zero intercept should be fitted
            sen and siegel methods always fit an intercept
        **kwargs
            passed to `acgc.stats.sma` (e.g. robust=True) or `sen`

        Returns
        -------
        result : dict
            dictionary with keys:
            - slope (float)
                slope of fitted line
            - intercept (float)
                intercept of fitted line
            - fittedvalues (array (N,))
                values on fit line
            - residuals (array (N,))
                residual from fit line
            - method (str)
                method argument, as given
            - fitintercept (bool)
                whether an intercept was fitted

        Raises
        ------
        NotImplementedError
            for methods wls and York
        ValueError
            for an unrecognized method
        '''

        fitintercept = intercept
        method_key = method.lower()

        if method_key=='sma':
            fit = sma(  self._x,
                        self._y,
                        self._w,
                        intercept=fitintercept,
                        **kwargs)
            slope = fit['slope']
            intercept_value = fit['intercept']

        elif method_key=='ols':
            if fitintercept:
                design = np.vstack([self._x,np.ones(len(self._x))]).T
                coef = np.linalg.lstsq( design, self._y, rcond=None )[0]
                slope = coef[0]
                intercept_value = coef[1]
            else:
                # Line through the origin: the solution has only one coefficient
                design = np.vstack([self._x]).T
                coef = np.linalg.lstsq( design, self._y, rcond=None )[0]
                slope = coef[0]
                intercept_value = 0.0

        elif method_key in ['theil','sen','theilsen']:
            # This method always fits an intercept
            fitintercept = True
            fit = sen( self._x,
                       self._y,
                       **kwargs)
            slope = fit.slope
            intercept_value = fit.intercept

        elif method_key=='siegel':
            # This method always fits an intercept
            fitintercept = True
            siegel = stats.siegelslopes( self._x,
                                         self._y )
            slope = siegel.slope
            intercept_value = siegel.intercept

        elif method_key=='wls':
            raise NotImplementedError('WLS regression not implemented yet')

        elif method_key=='york':
            raise NotImplementedError('York regression not implemented yet')

        else:
            raise ValueError('Undefined method '+method)

        fittedvalues = slope * self._x + intercept_value

        line = dict( slope          = slope,
                     intercept      = intercept_value,
                     fittedvalues   = fittedvalues,
                     residuals      = self._y - fittedvalues,
                     method         = method,
                     fitintercept   = fitintercept )

        return line

    def slope(self,method='sma',intercept=True,**kwargs):
        '''Compute slope of bivariate line fit

        Parameters
        ----------
        method : str
            line fitting method, see `fitline`
        intercept : bool
            defines whether non-zero intercept should be fitted
        **kwargs
            passed to `fitline`

        Returns
        -------
        slope : float
            value of slope
        '''
        return self.fitline(method,intercept,**kwargs)['slope']

    def intercept(self,method='sma',intercept=True,**kwargs):
        '''Compute intercept of bivariate line fit

        Parameters
        ----------
        method : str
            line fitting method, see `fitline`
        intercept : bool
            defines whether non-zero intercept should be fitted
        **kwargs
            passed to `fitline`

        Returns
        -------
        intercept : float
            value of y intercept
        '''
        return self.fitline(method,intercept,**kwargs)['intercept']

    def _expand_variables(self,variables):
        '''Expand special strings into a list of variables

        Parameters
        ----------
        variables : list, tuple, str, or None, default='common'
            Special strings ("all","common") will be expanded to a list of variables
            list arguments will not be modified
            None is treated as "common"

        Returns
        -------
        list
            variable names

        Raises
        ------
        ValueError
            if variables is not a list, tuple, None, "all", or "common"
        '''
        if variables is None:
            variables='common'
        if isinstance(variables,str):
            if variables=='all':
                variables=['MD','MAD','RMD','RMAD','MRD','SMD','SMAD',
                           'MLR','MALR',
                           'MedD','MedAD','RMedD','RMedAD','MedRD',
                           'MedLR','MedALR',
                           'NMBF','NMAEF','RMSD','cov',
                           'R','R2','spearmanr','slope','intercept',
                           'fitline','n']
            elif variables=='common':
                variables=['MD','MAD','RMD','RMAD','MRD','R2','slope','n']
        elif isinstance(variables,tuple):
            variables = list(variables)
        if not isinstance(variables,list):
            raise ValueError(
                'variables must be a list, None, or one of these strings: "all","common"')

        return variables

    def summary_dict(self, variables=None, fitline_kw=None, floatformat_fiteqn='{:.3f}'):
        '''Summarize bivariate statistics into a dict

        Parameters
        ----------
        variables : list or str, default='common'
            names of attribute variables to include in summary
            names are case insensitive
            The following strings are also accepted in place of a list
                "all" (displays all variables)
                "common" (displays all measures of mean difference)
        fitline_kw : dict, default=None
            keywords passed to `fitline`
            default (None) uses method='sma' and intercept=True
        floatformat_fiteqn : str, default='{:.3f}'
            format specifier for slope and intercept (a,b) in y = a x + b

        Returns
        -------
        summary : dict
            names and values of variables
        '''

        # List of variables
        variables = self._expand_variables(variables)

        if fitline_kw is None:
            fitline_kw = {'method':'sma',
                          'intercept':True}

        # Construct the dict
        summary = {}
        for v in variables:
            name = v.lower()
            if name in ('slope','intercept'):
                # These variables are object methods
                value = getattr(self,name)(**fitline_kw)
            elif name == 'fitline':
                # The line equation provides its own label, which replaces the name
                line = self.fitline(**fitline_kw)
                v,value = bivariate_line_equation(line,floatformat_fiteqn,ystring='separate')
            else:
                # Retrieve values
                value = getattr(self,name)

            summary[v] = value

        return summary

    def summary(self, variables=None, fitline_kw=None,
                intformat='{:d}', floatformat='{:.4f}', floatformat_fiteqn=None,
                stringlength=None ):
        '''Summarize bivariate statistics

        Parameters
        ----------
        variables : list or str, default='common'
            names of attribute variables to include in summary
            names are case insensitive
            The following strings are also accepted in place of a list
                "all" (displays all variables)
                "common" (displays all measures of mean difference)
        fitline_kw : dict, default=None
            keywords passed to `fitline`
        intformat : str, default='{:d}'
            format specifier for integer values
        floatformat : str, default='{:.4f}'
            format specifier for floating point values
        floatformat_fiteqn : str, default=floatformat
            format specifier for slope and intercept (a,b) in y = a x + b
        stringlength : int, default=None
            length of the variables on output
            default (None) is to use the length of the longest variable name

        Returns
        -------
        summary : str
            names and values of variables, one "name = value" per line
        '''
        # List of variables
        variables = self._expand_variables(variables)

        if floatformat_fiteqn is None:
            floatformat_fiteqn = floatformat
        if stringlength is None:
            stringlength = max( (len(v) for v in variables), default=0 )
        stringformat = '{:'+str(stringlength)+'s}'

        # Get a dict containing the needed variables
        summarydict = self.summary_dict( variables, fitline_kw, floatformat_fiteqn )

        # One line of text per variable
        lineformat = stringformat+' = {:s}\n'
        summarytext = ''.join(
            lineformat.format( k, _number2str(v,intformat,floatformat) )
            for k,v in summarydict.items() )

        return summarytext

    def summary_fig_inset(self, ax, variables=None, fitline_kw=None,
                          intformat='{:d}', floatformat='{:.3f}', floatformat_fiteqn=None,
                          loc=None, loc_units='axes',
                          **kwargs):
        '''Display bivariate statistics as a table inset on a plot axis

        Parameters
        ----------
        ax : matplotlib.Figure.Axis
            axis where the table will be displayed
        variables : list or str, default='common'
            names of attribute variables to include in summary
            names are case insensitive
            The following strings are also accepted in place of a list
                "all" (displays all variables)
                "common" (displays all measures of mean difference)
        fitline_kw : dict, default=None
            keywords passed to `fitline`
        intformat : str, default='{:d}'
            format specifier for integer values
        floatformat : str, default='{:.3f}'
            format specifier for floating point values
        floatformat_fiteqn : str, default=floatformat
            format specifier for slope and intercept (a,b) in y = a x + b
        loc : tuple (x0,y0), default=(0.8, 0.05)
            location on the axis where the table will be drawn
            can be in data units or axes units [0-1]
        loc_units : {'axes' (default), 'data'}
            specifies whether loc has 'data' units or 'axes' units [0-1]
        **kwargs
            passed to `ax.text` for both text boxes

        Returns
        -------
        text1, text2 : matplotlib text object
            Artist for the two text boxes

        Raises
        ------
        ValueError
            if loc_units is not 'data', 'axes', or 'axis'
        '''
        # List of variables
        variables = self._expand_variables(variables)

        if floatformat_fiteqn is None:
            floatformat_fiteqn = floatformat

        # Default location in lower right corner
        if loc is None:
            loc = (0.8,0.05)

        # Coordinates for loc
        if loc_units.lower()=='data':
            coord=ax.transData
        elif loc_units.lower() in ['axes','axis']:
            coord=ax.transAxes
        else:
            raise ValueError('Display units should be "Data" or "Axes"')

        # Get a dict containing the needed variables
        summarydict = self.summary_dict( variables, fitline_kw, floatformat_fiteqn )

        # Column of label text
        label_text = '\n'.join([_texify_name(key)
                                for key in summarydict])
        # Column of value text
        value_text = '\n'.join([_number2str(v,intformat,floatformat)
                                for v in summarydict.values()])

        # Check if horizontal alignment keyword is used
        # 'horizontalalignment' takes precedence over 'ha' when both are given
        ha = kwargs.get('horizontalalignment', kwargs.get('ha',''))

        # For right alignment, align on values first
        # Otherwise, align on labels
        if ha=='right':
            first_text = value_text
            second_text = label_text
            sign = -1
        else:
            first_text = label_text
            second_text = value_text
            sign = +1

        # Add first column of text
        t1=ax.text(loc[0],loc[1],
                first_text,
                transform=coord,
                **kwargs
                )

        # Get width of first text column, in the same units as loc
        bbox = t1.get_window_extent().transformed(coord.inverted())
        width = bbox.x1-bbox.x0

        # Add second column of text, offset by the width of the first column
        t2 = ax.text(loc[0]+width*sign,loc[1],
                second_text,
                transform=coord,
                **kwargs
                )

        return [t1,t2]
188class BivariateStatistics: 189 '''A suite of common statistics to quantify bivariate relationships 190 191 Class method 'summary' provides a formatted summary of these statistics 192 193 Attributes 194 ---------- 195 count, n : int 196 number of valid (not NaN) data value pairs 197 xmean, ymean : float 198 mean of x and y variables 199 xmedian, ymedian :float 200 median of x and y variables 201 xstd, ystd : float 202 standard deviation of x and y variables 203 mean_difference, md : float 204 ymean - xmean 205 std_difference, stdd : float 206 std( y - x ) 207 mean_absolute_difference, mad : float 208 mean( |y-x| ) 209 relative_mean_difference, rmd : float 210 md / xmean 211 relative_mean_absolute_difference, rmad :float 212 mad / xmean 213 standardized_mean_difference, smd : float 214 md / xstd 215 standardized_mean_absolute_difference, smad : float 216 mad /xstd 217 mean_relative_difference, mrd : float 218 mean(y/x) - 1 219 mean_log10_ratio, mlr : float 220 mean( log10(y/x) ) 221 std_log10_ratio, stdlr : float 222 std( log10(y/x) ) 223 mean_absolute_log10_ratio, malr : float 224 mean( abs( log10(y/x) ) ) 225 median_difference, medd : float 226 median(y-x) 227 median_absolute_difference, medad : float 228 median(|y-x|) 229 relative_median_difference, rmedd : float 230 median(y-x) / xmedian 231 relative_median_absolute_difference, rmedad : float 232 median(|y-x|) / xmedian 233 median_relative_difference, medianrd, medrd : float 234 median(y/x)-1 235 median_log10_ratio, medlr : float 236 median( log10(y/x) ) 237 median_absolute_log10_ratio, medalr : float 238 median( abs( log10(y/x) ) ) 239 normalized_mean_bias_factor, nmbf : float 240 see `nmbf` 241 normalized_mean_absolute_error_factor, nmaef : float 242 see `nmaef` 243 root_mean_square_difference, rmsd : float 244 $\\sqrt{ \\langle (y - x)^2 \\rangle }$ 245 root_mean_square_log10_ratio, rmslr : float 246 $\\sqrt{ \\langle \\log_{10}(y/x)^2 \\rangle }$ 247 covariance : float 248 cov(x,y) 249 correlation_pearson, 
correlation, pearsonr, R, r : float 250 Pearson linear correlation coefficient 251 correlation_spearman, spearmanr : float 252 Spearman, non-parametric rank correlation coefficient 253 R2, r2 : float 254 Linear coefficient of determination, $R^2$ 255 ''' 256 257 def __init__(self,x,y,w=None,dropna=False,data=None): 258 '''Compute suite of bivariate statistics during initialization 259 260 Statistic values are saved in attributes. 261 CAUTION: Weights w are ignored except in SMA fit 262 263 Parameters 264 ---------- 265 x : ndarray or str 266 independent variable values 267 y : ndarray or str 268 dependent variable values, same size as x 269 w : ndarray or str, optional 270 weights for points (x,y), same size as x and y 271 dropna : bool, optional (default=False) 272 drops NaN values from x, y, and w 273 data : dict-like, optional 274 if x, y, or w are str, then they should be keys in data 275 ''' 276 277 # Get values from data if needed 278 if data is None and (isinstance(x,str) or isinstance(y,str) or isinstance(w,str)): 279 raise ValueError( 'Data argument must be used if x, y, or w is a string') 280 if isinstance(x,str): 281 x = data[x] 282 if isinstance(y,str): 283 y = data[y] 284 if isinstance(w,str): 285 w = data[w] 286 287 #Ensure that x and y have same length 288 if len(x) != len(y): 289 raise ValueError( 'Arguments x and y must have the same length' ) 290 if w is None: 291 w = np.ones_like(x) 292 if len(w) != len(x): 293 raise ValueError( 'Argument w (if present) must have the same length as x' ) 294 295 # Drop NaN values 296 if dropna: 297 isna = np.isnan(x*y*w) 298 x = x[~isna] 299 y = y[~isna] 300 w = w[~isna] 301 302 # Differences and ratios used repeatedly 303 diff = y - x 304 absdiff = np.abs( y - x ) 305 # Ignore divide by zero and 0/0 while dividing 306 old_settings = np.seterr(divide='ignore',invalid='ignore') 307 ratio = y/x 308 log10ratio = np.log10(ratio) 309 np.seterr(**old_settings) 310 311 # Number of data points 312 self.count = self.n = 
len(x) 313 314 # Means, medians, and standard deviations 315 self.xmean = np.mean(x) 316 self.ymean = np.mean(y) 317 self.xmedian = np.median(x) 318 self.ymedian = np.median(y) 319 self.xstd = np.std(x) 320 self.ystd = np.std(y) 321 322 # Save values for use later 323 self._x = x 324 self._y = y 325 self._w = w 326 327 # Mean and mean absolute differences 328 self.mean_difference = self.md = self.ymean - self.xmean 329 self.mean_absolute_difference = self.mad = np.mean( absdiff ) 330 self.std_difference = self.stdd = np.std( diff ) 331 332 # Relative and standardized differences 333 self.relative_mean_difference = self.rmd = self.mean_difference / self.xmean 334 self.relative_mean_absolute_difference = self.rmad = self.mean_absolute_difference / self.xmean 335 self.standardized_mean_difference = self.smd = self.mean_difference / self.xstd 336 self.standardized_mean_absolute_difference = self.smad = self.mean_absolute_difference / self.xstd 337 338 # Mean and median relative differences 339 self.mean_relative_difference = self.mrd = np.mean( ratio - 1 ) 340 self.median_relative_difference = self.medianrd = self.medrd = np.median( ratio - 1 ) 341 342 # Median and median absolute differences 343 self.median_difference = self.medd = np.median( diff ) 344 self.median_absolute_difference = self.medad = np.median( absdiff ) 345 346 # Relative median differences 347 self.relative_median_difference = self.rmedd = self.median_difference / self.xmedian 348 self.relative_median_absolute_difference = self.rmedad = self.median_absolute_difference / self.xmedian 349 350 self.normalized_mean_bias_factor = self.nmbf = nmbf(x,y) 351 self.normalized_mean_absolute_error_factor = self.nmaef = nmaef(x,y) 352 353 # Mean and mean absolute log ratio 354 self.mean_log10_ratio = self.mlr = np.mean( log10ratio ) 355 self.mean_absolute_log10_ratio = self.malr = np.mean( np.abs( log10ratio ) ) 356 self.std_log10_ratio = self.stdlr= np.std( log10ratio ) 357 358 # Median and median absolute log 
ratio 359 self.median_log10_ratio = self.medlr = np.median( log10ratio ) 360 self.median_absolute_log10_ratio = self.medalr = np.median( np.abs( log10ratio ) ) 361 362 # RMS difference 363 self.root_mean_square_difference = self.rmsd = np.sqrt( np.mean( np.power( diff, 2) ) ) 364 # RMS log ratio 365 self.root_mean_square_log10_ratio = self.rmslr = np.sqrt( np.mean( np.power( log10ratio, 2 ))) 366 367 # Covariance, correlation 368 self.covariance = np.cov(x,y)[0][1] 369 self.correlation = self.correlation_pearson = self.R = self.r = self.pearsonr = \ 370 np.corrcoef(x,y)[0][1] 371 self.correlation_spearman = self.spearmanr = stats.spearmanr(x,y).statistic 372 self.R2 = self.r2 = self.R**2 373 374 def __getitem__(self,key): 375 '''Accesses attribute values via object['key']''' 376 return getattr(self,key) 377 378 def fitline(self,method='sma',intercept=True,**kwargs): 379 '''Compute bivariate line fit 380 381 Parameters 382 ---------- 383 method : str 384 line fitting method: sma (default), ols, wls, York, sen, siegel 385 intercept : bool 386 defines whether non-zero intercept should be fitted 387 **kwargs 388 passed to `acgc.stats.sma` (e.g. 
robust=True) 389 390 Returns 391 ------- 392 result : dict 393 dictionary with keys: 394 - slope (float) 395 slope of fitted line 396 - intercept (float) 397 intercept of fitted line 398 - fittedvalues (array (N,)) 399 values on fit line 400 - residuals (array (N,)) 401 residual from fit line 402 ''' 403 404 fitintercept = intercept 405 406 if method.lower()=='sma': 407 fit = sma( self._x, 408 self._y, 409 self._w, 410 intercept=fitintercept, 411 **kwargs) 412 slope = fit['slope'] 413 intercept= fit['intercept'] 414 415 elif method.lower()=='ols': 416 if fitintercept: 417 ols = np.linalg.lstsq( np.vstack([self._x,np.ones(len(self._x))]).T, 418 self._y, rcond=None ) 419 else: 420 ols = np.linalg.lstsq( np.vstack([self._x]).T, self._y, rcond=None ) 421 slope = ols[0][0] 422 intercept = ols[0][1] 423 424 elif method.lower() in ['theil','sen','theilsen']: 425 fitintercept = True 426 fit = sen( self._x, 427 self._y, 428 **kwargs) 429 slope = fit.slope 430 intercept = fit.intercept 431 432 elif method.lower()=='siegel': 433 fitintercept = True 434 siegel = stats.siegelslopes( self._x, 435 self._y ) 436 slope = siegel.slope 437 intercept = siegel.intercept 438 439 elif method.lower()=='wls': 440 raise NotImplementedError('WLS regression not implemented yet') 441 442 elif method.lower()=='york': 443 raise NotImplementedError('York regression not implemented yet') 444 445 else: 446 raise ValueError('Undefined method '+method) 447 448 line = dict( slope = slope, 449 intercept = intercept, 450 fittedvalues = slope * self._x + intercept, 451 residuals = self._y - ( slope * self._x + intercept ), 452 method = method, 453 fitintercept = fitintercept ) 454 455 return line 456 457 def slope(self,method='sma',intercept=True,**kwargs): 458 '''Compute slope of bivariate line fit 459 460 Parameters 461 ---------- 462 method : str 463 line fitting method: sma (default), ols, wls 464 intercept : bool 465 defines whether non-zero intercept should be fitted 466 **kwargs 467 passed to 
`fitline` 468 469 Returns 470 ------- 471 slope : float 472 value of y intercept 473 ''' 474 return self.fitline(method,intercept,**kwargs)['slope'] 475 476 def intercept(self,method='sma',intercept=True,**kwargs): 477 '''Compute intercept of bivariate line fit 478 479 Parameters 480 ---------- 481 method : str 482 line fitting method: sma (default) or ols 483 intercept : bool 484 defines whether non-zero intercept should be fitted 485 **kwargs 486 passed to `fitline` 487 488 Returns 489 ------- 490 intercept : float 491 value of y intercept 492 ''' 493 return self.fitline(method,intercept,**kwargs)['intercept'] 494 495 def _expand_variables(self,variables): 496 '''Expand special strings into a list of variables 497 498 Parameter 499 --------- 500 variables : list or str, default='common' 501 Special strings ("all","common") will be expanded to a list of variables 502 list arguments will not be modified 503 504 Returns 505 ------- 506 list 507 variable names 508 ''' 509 if variables is None: 510 variables='common' 511 if variables=='all': 512 variables=['MD','MAD','RMD','RMAD','MRD','SMD','SMAD', 513 'MLR','MALR', 514 'MedD','MedAD','RMedD','RMedAD','MedRD', 515 'MedLR','MedALR', 516 'NMBF','NMAEF','RMSD','cov', 517 'R','R2','spearmanr','slope','intercept', 518 'fitline','n'] 519 elif variables=='common': 520 variables=['MD','MAD','RMD','RMAD','MRD','R2','slope','n'] 521 if not isinstance(variables,list): 522 raise ValueError( 523 'variables must be a list, None, or one of these strings: "all","common"') 524 525 return variables 526 527 def summary_dict(self, variables=None, fitline_kw=None, floatformat_fiteqn='{:.3f}'): 528 '''Summarize bivariate statistics into a dict 529 530 Parameters 531 ---------- 532 vars : list or str, default='common' 533 names of attribute variables to include in summary 534 names are case insensitive 535 The following strings are also accepted in place of a list 536 "all" (displays all variables) 537 "common" (displays all measures of 
mean difference) 538 fitline_kw : dict, default=None 539 keywords passed to `fitline` 540 floatformat_fiteqn : str, default=floatformat 541 format specifier for slope and intercept (a,b) in y = a x + b 542 543 Returns 544 ------- 545 summary : dict 546 names and values of variables 547 ''' 548 549 # List of variables 550 variables = self._expand_variables(variables) 551 552 if fitline_kw is None: 553 fitline_kw = {'method':'sma', 554 'intercept':True} 555 556 # Construct the dict 557 summary = {} 558 for v in variables: 559 if v in ['slope','intercept']: 560 # These variables are object methods 561 func = getattr(self,v) 562 value = func(**fitline_kw) 563 elif v == 'fitline': 564 line = self.fitline(**fitline_kw) 565 v,value = bivariate_line_equation(line,floatformat_fiteqn,ystring='separate') 566 else: 567 # Retrieve values 568 value = getattr(self,v.lower()) 569 570 # summary += (stringformat+'='+floatformat+'\n').format(v,value) 571 summary[v] = value 572 573 return summary 574 575 def summary(self, variables=None, fitline_kw=None, 576 intformat='{:d}', floatformat='{:.4f}', floatformat_fiteqn=None, 577 stringlength=None ): 578 '''Summarize bivariate statistics 579 580 Parameters 581 ---------- 582 vars : list or str, default='common' 583 names of attribute variables to include in summary 584 names are case insensitive 585 The following strings are also accepted in place of a list 586 "all" (displays all variables) 587 "common" (displays all measures of mean difference) 588 fitline_kw : dict, default=None 589 keywords passed to `fitline` 590 intformat : str, default='{:d}' 591 format specifier for integer values 592 floatformat : str, default='{:.4f}' 593 format specifier for floating point values 594 floatformat_fiteqn : str, default=floatformat 595 format specifier for slope and intercept (a,b) in y = a x + b 596 stringlength : int, default=None 597 length of the variables on output 598 default (None) is to use the length of the longest variable name 599 600 
Returns 601 ------- 602 summary : str 603 names and values of variables 604 ''' 605 # List of variables 606 variables = self._expand_variables(variables) 607 608 if floatformat_fiteqn is None: 609 floatformat_fiteqn = floatformat 610 if stringlength is None: 611 stringlength = np.max([len(v) for v in variables]) 612 stringformat = '{:'+str(stringlength)+'s}' 613 614 # Get a dict containing the needed variables 615 summarydict = self.summary_dict( variables, fitline_kw, floatformat_fiteqn ) 616 617 # Extract length of the float numbers from floatformat 618 # import re 619 # floatlength = np.floor( float( re.findall("[-+]?(?:\d*\.*\d+)", 620 # floatformat )[0] ) ).astype(int) 621 622 # summary = (stringformat+'{:>10s}').format('Variable','Value') 623 summarytext = '' 624 for k,v in summarydict.items(): 625 vstr = _number2str(v,intformat,floatformat) 626 summarytext += (stringformat+' = {:s}\n').format(k,vstr) 627 628 return summarytext 629 630 def summary_fig_inset(self, ax, variables=None, fitline_kw=None, 631 intformat='{:d}', floatformat='{:.3f}', floatformat_fiteqn=None, 632 loc=None, loc_units='axes', 633 **kwargs): 634 '''Display bivariate statistics as a table inset on a plot axis 635 636 Parameters 637 ---------- 638 ax : matplotlib.Figure.Axis 639 axis where the table will be displayed 640 variables : list or str, default='common' 641 names of attribute variables to include in summary 642 names are case insensitive 643 The following strings are also accepted in place of a list 644 "all" (displays all variables) 645 "common" (displays all measures of mean difference) 646 fitline_kw : dict, default=None 647 keywords passed to `fitline` 648 intformat : str, default='{:d}' 649 format specifier for integer values 650 floatformat : str, default='{:.3f}' 651 format specifier for floating point values 652 floatformat_fiteqn : str, default=floatformat 653 format specifier for slope and intercept (a,b) in y = a x + b 654 loc : tuple (x0,y0), default=(0.85, 0.05) 655 
location on the axis where the table will be drawn 656 can be in data units or axes units [0-1] 657 loc_units : {'axes' (default), 'data'} 658 specifies whether loc has 'data' units or 'axes' units [0-1] 659 660 Returns 661 ------- 662 text1, text2 : matplotlib text object 663 Artist for the two text boxes 664 ''' 665 # List of variables 666 variables = self._expand_variables(variables) 667 668 if floatformat_fiteqn is None: 669 floatformat_fiteqn = floatformat 670 671 # Default location in lower right corner 672 if loc is None: 673 loc = (0.8,0.05) 674 675 # Coordinates for loc 676 if loc_units.lower()=='data': 677 coord=ax.transData 678 elif loc_units.lower() in ['axes','axis']: 679 coord=ax.transAxes 680 else: 681 raise ValueError('Display units should be "Data" or "Axes"') 682 683 # Get a dict containing the needed variables 684 summarydict = self.summary_dict( variables, fitline_kw, floatformat_fiteqn ) 685 686 # Column of label text 687 label_text = '\n'.join([_texify_name(key) 688 for key in summarydict]) 689 # Column of value text 690 value_text = '\n'.join([_number2str(v,intformat,floatformat) 691 for v in summarydict.values()]) 692 693 # Check if horizontal alignment keyword is used 694 ha='' 695 try: 696 ha = kwargs['ha'] 697 except KeyError: 698 pass 699 try: 700 ha = kwargs['horizontalalignment'] 701 except KeyError: 702 pass 703 704 # For right alignment, align on values first 705 # Otherwise, align on labels 706 if ha=='right': 707 first_text = value_text 708 second_text = label_text 709 sign = -1 710 else: 711 first_text = label_text 712 second_text = value_text 713 sign = +1 714 715 # Add first column of text 716 t1=ax.text(loc[0],loc[1], 717 first_text, 718 transform=coord, 719 **kwargs 720 ) 721 722 # Get width of first text column 723 bbox = t1.get_window_extent().transformed(coord.inverted()) 724 width = bbox.x1-bbox.x0 725 726 # Add second column of text 727 t2 = ax.text(loc[0]+width*sign,loc[1], 728 second_text, 729 transform=coord, 730 
**kwargs 731 ) 732 733 ################################## 734 # Early version of this function using matplotlib.table.table() 735 736 # if isinstance(loc,(tuple,list)): 737 # # Create an inset axis to contain the table 738 # tableaxis = ax.inset_axes(loc) 739 # table_width=1 740 # else: 741 # tableaxis = ax 742 743 # # Display the table on the axis 744 # return mtable.table( 745 # tableaxis, 746 # cellText=[[floatformat.format(value)] for value in summarydict.values()], 747 # rowLabels=[texify_name(key) for key in summarydict], 748 # colWidths=[table_width/2]*2, 749 # edges=edges, 750 # loc=loc, bbox=bbox 751 # ) 752 753 return [t1,t2]
A suite of common statistics to quantify bivariate relationships
Class method 'summary' provides a formatted summary of these statistics
Attributes
- count, n (int): number of valid (not NaN) data value pairs
- xmean, ymean (float): mean of x and y variables
- xmedian, ymedian (float): median of x and y variables
- xstd, ystd (float): standard deviation of x and y variables
- mean_difference, md (float): ymean - xmean
- std_difference, stdd (float): std( y - x )
- mean_absolute_difference, mad (float): mean( |y-x| )
- relative_mean_difference, rmd (float): md / xmean
- relative_mean_absolute_difference, rmad (float): mad / xmean
- standardized_mean_difference, smd (float): md / xstd
- standardized_mean_absolute_difference, smad (float): mad / xstd
- mean_relative_difference, mrd (float): mean(y/x) - 1
- mean_log10_ratio, mlr (float): mean( log10(y/x) )
- std_log10_ratio, stdlr (float): std( log10(y/x) )
- mean_absolute_log10_ratio, malr (float): mean( abs( log10(y/x) ) )
- median_difference, medd (float): median(y-x)
- median_absolute_difference, medad (float): median(|y-x|)
- relative_median_difference, rmedd (float): median(y-x) / xmedian
- relative_median_absolute_difference, rmedad (float): median(|y-x|) / xmedian
- median_relative_difference, medianrd, medrd (float): median(y/x)-1
- median_log10_ratio, medlr (float): median( log10(y/x) )
- median_absolute_log10_ratio, medalr (float): median( abs( log10(y/x) ) )
- normalized_mean_bias_factor, nmbf (float):
see
nmbf
- normalized_mean_absolute_error_factor, nmaef (float):
see
nmaef
- root_mean_square_difference, rmsd (float): $\sqrt{ \langle (y - x)^2 \rangle }$
- root_mean_square_log10_ratio, rmslr (float): $\sqrt{ \langle \log_{10}(y/x)^2 \rangle }$
- covariance (float): cov(x,y)
- correlation_pearson, correlation, pearsonr, R, r (float): Pearson linear correlation coefficient
- correlation_spearman, spearmanr (float): Spearman, non-parametric rank correlation coefficient
- R2, r2 (float): Linear coefficient of determination, $R^2$
def __init__(self,x,y,w=None,dropna=False,data=None):
    '''Compute suite of bivariate statistics during initialization

    Statistic values are saved in attributes.
    CAUTION: Weights w are ignored except in SMA fit

    Parameters
    ----------
    x : ndarray or str
        independent variable values
    y : ndarray or str
        dependent variable values, same size as x
    w : ndarray or str, optional
        weights for points (x,y), same size as x and y
    dropna : bool, optional (default=False)
        drops NaN values from x, y, and w
    data : dict-like, optional
        if x, y, or w are str, then they should be keys in data

    Raises
    ------
    ValueError
        if x, y, or w is a str while data is None,
        or if x, y, and w do not all have the same length
    '''

    # Get values from data if needed
    if data is None and (isinstance(x,str) or isinstance(y,str) or isinstance(w,str)):
        raise ValueError( 'Data argument must be used if x, y, or w is a string')
    if isinstance(x,str):
        x = data[x]
    if isinstance(y,str):
        y = data[y]
    if isinstance(w,str):
        w = data[w]

    # Ensure that x, y, and w have the same length
    if len(x) != len(y):
        raise ValueError( 'Arguments x and y must have the same length' )
    if w is None:
        w = np.ones_like(x)
    if len(w) != len(x):
        raise ValueError( 'Argument w (if present) must have the same length as x' )

    # Drop NaN values; a NaN in any of x, y, w makes the product NaN
    if dropna:
        isna = np.isnan(x*y*w)
        x = x[~isna]
        y = y[~isna]
        w = w[~isna]

    # Differences and ratios used repeatedly
    diff = y - x
    absdiff = np.abs( y - x )
    # Ignore divide by zero and 0/0 while dividing.
    # The context manager restores the numpy error state even if the division raises.
    with np.errstate(divide='ignore',invalid='ignore'):
        ratio = y/x
        log10ratio = np.log10(ratio)

    # Number of data points
    self.count = self.n = len(x)

    # Means, medians, and standard deviations
    self.xmean = np.mean(x)
    self.ymean = np.mean(y)
    self.xmedian = np.median(x)
    self.ymedian = np.median(y)
    self.xstd = np.std(x)
    self.ystd = np.std(y)

    # Save values for use later (line fitting)
    self._x = x
    self._y = y
    self._w = w

    # Mean and mean absolute differences
    self.mean_difference = self.md = self.ymean - self.xmean
    self.mean_absolute_difference = self.mad = np.mean( absdiff )
    self.std_difference = self.stdd = np.std( diff )

    # Relative and standardized differences
    self.relative_mean_difference = self.rmd = self.mean_difference / self.xmean
    self.relative_mean_absolute_difference = self.rmad = \
        self.mean_absolute_difference / self.xmean
    self.standardized_mean_difference = self.smd = self.mean_difference / self.xstd
    self.standardized_mean_absolute_difference = self.smad = \
        self.mean_absolute_difference / self.xstd

    # Mean and median relative differences
    self.mean_relative_difference = self.mrd = np.mean( ratio - 1 )
    self.median_relative_difference = self.medianrd = self.medrd = np.median( ratio - 1 )

    # Median and median absolute differences
    self.median_difference = self.medd = np.median( diff )
    self.median_absolute_difference = self.medad = np.median( absdiff )

    # Relative median differences
    self.relative_median_difference = self.rmedd = self.median_difference / self.xmedian
    self.relative_median_absolute_difference = self.rmedad = \
        self.median_absolute_difference / self.xmedian

    # Normalized mean bias factor and absolute error factor (x is the reference)
    self.normalized_mean_bias_factor = self.nmbf = nmbf(x,y)
    self.normalized_mean_absolute_error_factor = self.nmaef = nmaef(x,y)

    # Mean and mean absolute log ratio
    self.mean_log10_ratio = self.mlr = np.mean( log10ratio )
    self.mean_absolute_log10_ratio = self.malr = np.mean( np.abs( log10ratio ) )
    self.std_log10_ratio = self.stdlr = np.std( log10ratio )

    # Median and median absolute log ratio
    self.median_log10_ratio = self.medlr = np.median( log10ratio )
    self.median_absolute_log10_ratio = self.medalr = np.median( np.abs( log10ratio ) )

    # RMS difference
    self.root_mean_square_difference = self.rmsd = np.sqrt( np.mean( np.power( diff, 2) ) )
    # RMS log ratio
    self.root_mean_square_log10_ratio = self.rmslr = \
        np.sqrt( np.mean( np.power( log10ratio, 2 )))

    # Covariance, correlation
    # The short alias `cov` is the name requested by the "all" variable list in summaries
    self.covariance = self.cov = np.cov(x,y)[0][1]
    self.correlation = self.correlation_pearson = self.R = self.r = self.pearsonr = \
        np.corrcoef(x,y)[0][1]
    self.correlation_spearman = self.spearmanr = stats.spearmanr(x,y).statistic
    self.R2 = self.r2 = self.R**2
Compute suite of bivariate statistics during initialization
Statistic values are saved in attributes. CAUTION: Weights w are ignored except in SMA fit
Parameters
- x (ndarray or str): independent variable values
- y (ndarray or str): dependent variable values, same size as x
- w (ndarray or str, optional): weights for points (x,y), same size as x and y
- dropna (bool, optional (default=False)): drops NaN values from x, y, and w
- data (dict-like, optional): if x, y, or w are str, then they should be keys in data
def fitline(self,method='sma',intercept=True,**kwargs):
    '''Compute bivariate line fit

    Parameters
    ----------
    method : str
        line fitting method (case insensitive):
        sma (default), ols, sen (also theil, theilsen), siegel.
        wls and York are recognized but not implemented.
    intercept : bool
        defines whether non-zero intercept should be fitted.
        Only used by sma and ols; sen and siegel always fit an intercept.
    **kwargs
        passed to `sma` or `sen` (e.g. robust=True for sma)

    Returns
    -------
    result : dict
        dictionary with keys:
        - slope (float)
            slope of fitted line
        - intercept (float)
            intercept of fitted line
        - fittedvalues (array (N,))
            values on fit line
        - residuals (array (N,))
            residual from fit line
        - method (str)
            method argument, as given
        - fitintercept (bool)
            whether an intercept was fitted

    Raises
    ------
    NotImplementedError
        for methods wls and york
    ValueError
        for any other unrecognized method
    '''

    fitintercept = intercept
    method_lc = method.lower()

    if method_lc=='sma':
        fit = sma( self._x,
                   self._y,
                   self._w,
                   intercept=fitintercept,
                   **kwargs)
        slope = fit['slope']
        intercept = fit['intercept']

    elif method_lc=='ols':
        if fitintercept:
            # Design matrix columns [x, 1] give solution [slope, intercept]
            design = np.vstack([self._x,np.ones(len(self._x))]).T
            slope, intercept = np.linalg.lstsq( design, self._y, rcond=None )[0]
        else:
            # Line through the origin: the solution holds only the slope
            design = np.vstack([self._x]).T
            slope = np.linalg.lstsq( design, self._y, rcond=None )[0][0]
            intercept = 0.0

    elif method_lc in ['theil','sen','theilsen']:
        fitintercept = True
        fit = sen( self._x,
                   self._y,
                   **kwargs)
        slope = fit.slope
        intercept = fit.intercept

    elif method_lc=='siegel':
        fitintercept = True
        # scipy signature is siegelslopes(y, x): dependent variable first
        siegel = stats.siegelslopes( self._y,
                                     self._x )
        slope = siegel.slope
        intercept = siegel.intercept

    elif method_lc=='wls':
        raise NotImplementedError('WLS regression not implemented yet')

    elif method_lc=='york':
        raise NotImplementedError('York regression not implemented yet')

    else:
        raise ValueError('Undefined method '+method)

    fitted = slope * self._x + intercept
    line = dict( slope = slope,
                 intercept = intercept,
                 fittedvalues = fitted,
                 residuals = self._y - fitted,
                 method = method,
                 fitintercept = fitintercept )

    return line
Compute bivariate line fit
Parameters
- method (str): line fitting method: sma (default), ols, wls, York, sen, siegel
- intercept (bool): defines whether non-zero intercept should be fitted
- **kwargs: passed to
acgc.stats.sma
(e.g. robust=True)
Returns
- result (dict):
dictionary with keys:
- slope (float) slope of fitted line
- intercept (float) intercept of fitted line
- fittedvalues (array (N,)) values on fit line
- residuals (array (N,)) residual from fit line
def slope(self,method='sma',intercept=True,**kwargs):
    '''Compute slope of bivariate line fit

    Thin wrapper that runs `fitline` and extracts the slope.

    Parameters
    ----------
    method : str
        line fitting method, passed to `fitline` (default sma)
    intercept : bool
        defines whether non-zero intercept should be fitted
    **kwargs
        passed to `fitline`

    Returns
    -------
    slope : float
        slope of the fitted line
    '''
    line = self.fitline(method, intercept, **kwargs)
    return line['slope']
Compute slope of bivariate line fit
Parameters
- method (str): line fitting method: sma (default), ols, wls
- intercept (bool): defines whether non-zero intercept should be fitted
- **kwargs: passed to
fitline
Returns
- slope (float): slope of the fitted line
def intercept(self,method='sma',intercept=True,**kwargs):
    '''Compute intercept of bivariate line fit

    Thin wrapper that runs `fitline` and extracts the intercept.

    Parameters
    ----------
    method : str
        line fitting method, passed to `fitline` (default sma)
    intercept : bool
        defines whether non-zero intercept should be fitted
    **kwargs
        passed to `fitline`

    Returns
    -------
    intercept : float
        value of y intercept
    '''
    line = self.fitline(method, intercept, **kwargs)
    return line['intercept']
Compute intercept of bivariate line fit
Parameters
- method (str): line fitting method: sma (default) or ols
- intercept (bool): defines whether non-zero intercept should be fitted
- **kwargs: passed to
fitline
Returns
- intercept (float): value of y intercept
def summary_dict(self, variables=None, fitline_kw=None, floatformat_fiteqn='{:.3f}'):
    '''Summarize bivariate statistics into a dict

    Parameters
    ----------
    variables : list or str, default='common'
        names of attribute variables to include in summary
        names are case insensitive
        The following strings are also accepted in place of a list
            "all" (displays all variables)
            "common" (displays a short list of commonly used statistics)
    fitline_kw : dict, default=None
        keywords passed to `fitline`
        default (None) uses method='sma' with a fitted intercept
    floatformat_fiteqn : str, default='{:.3f}'
        format specifier for slope and intercept (a,b) in y = a x + b

    Returns
    -------
    summary : dict
        names and values of variables.
        Keys are the names as requested, except "fitline",
        whose key and value both come from `bivariate_line_equation`
    '''

    # List of variables
    variables = self._expand_variables(variables)

    if fitline_kw is None:
        fitline_kw = {'method':'sma',
                      'intercept':True}

    # Construct the dict
    summary = {}
    for v in variables:
        # Match names case-insensitively, including the special cases below
        name = v.lower()
        if name in ('slope','intercept'):
            # These variables are object methods that need the fit keywords
            value = getattr(self,name)(**fitline_kw)
        elif name == 'fitline':
            line = self.fitline(**fitline_kw)
            v,value = bivariate_line_equation(line,floatformat_fiteqn,ystring='separate')
        else:
            # Plain attribute set during initialization
            value = getattr(self,name)

        summary[v] = value

    return summary
Summarize bivariate statistics into a dict
Parameters
- variables (list or str, default='common'):
names of attribute variables to include in summary
names are case insensitive
The following strings are also accepted in place of a list: "all" (displays all variables), "common" (displays a short list of commonly used statistics)
- fitline_kw (dict, default=None):
keywords passed to
fitline
- floatformat_fiteqn (str, default='{:.3f}'): format specifier for slope and intercept (a,b) in y = a x + b
Returns
- summary (dict): names and values of variables
def summary(self, variables=None, fitline_kw=None,
            intformat='{:d}', floatformat='{:.4f}', floatformat_fiteqn=None,
            stringlength=None ):
    '''Summarize bivariate statistics

    Parameters
    ----------
    variables : list or str, default='common'
        names of attribute variables to include in summary
        names are case insensitive
        The following strings are also accepted in place of a list
            "all" (displays all variables)
            "common" (displays a short list of commonly used statistics)
    fitline_kw : dict, default=None
        keywords passed to `fitline`
    intformat : str, default='{:d}'
        format specifier for integer values
    floatformat : str, default='{:.4f}'
        format specifier for floating point values
    floatformat_fiteqn : str, default=floatformat
        format specifier for slope and intercept (a,b) in y = a x + b
    stringlength : int, default=None
        length of the variables on output
        default (None) is to use the length of the longest variable name

    Returns
    -------
    summary : str
        names and values of variables, one "name = value" line per variable;
        empty string if no variables are requested
    '''
    # List of variables
    variables = self._expand_variables(variables)

    if floatformat_fiteqn is None:
        floatformat_fiteqn = floatformat
    if stringlength is None:
        # default=0 keeps an empty variable list from raising
        stringlength = max( (len(v) for v in variables), default=0 )
    stringformat = '{:'+str(stringlength)+'s}'

    # Get a dict containing the needed variables
    summarydict = self.summary_dict( variables, fitline_kw, floatformat_fiteqn )

    # One "name = value" line per variable
    lineformat = stringformat+' = {:s}\n'
    return ''.join( lineformat.format(k,_number2str(v,intformat,floatformat))
                    for k,v in summarydict.items() )
Summarize bivariate statistics
Parameters
- variables (list or str, default='common'):
names of attribute variables to include in summary
names are case insensitive
The following strings are also accepted in place of a list: "all" (displays all variables), "common" (displays a short list of commonly used statistics)
- fitline_kw (dict, default=None):
keywords passed to
fitline
- intformat (str, default='{:d}'): format specifier for integer values
- floatformat (str, default='{:.4f}'): format specifier for floating point values
- floatformat_fiteqn (str, default=floatformat): format specifier for slope and intercept (a,b) in y = a x + b
- stringlength (int, default=None): length of the variables on output default (None) is to use the length of the longest variable name
Returns
- summary (str): names and values of variables
def summary_fig_inset(self, ax, variables=None, fitline_kw=None,
                      intformat='{:d}', floatformat='{:.3f}', floatformat_fiteqn=None,
                      loc=None, loc_units='axes',
                      **kwargs):
    '''Display bivariate statistics as a table inset on a plot axis

    The table is drawn as two adjacent text boxes: one column of
    variable names and one column of values.

    Parameters
    ----------
    ax : matplotlib.Figure.Axis
        axis where the table will be displayed
    variables : list or str, default='common'
        names of attribute variables to include in summary
        names are case insensitive
        The following strings are also accepted in place of a list
            "all" (displays all variables)
            "common" (displays a short list of commonly used statistics)
    fitline_kw : dict, default=None
        keywords passed to `fitline`
    intformat : str, default='{:d}'
        format specifier for integer values
    floatformat : str, default='{:.3f}'
        format specifier for floating point values
    floatformat_fiteqn : str, default=floatformat
        format specifier for slope and intercept (a,b) in y = a x + b
    loc : tuple (x0,y0), default=(0.8, 0.05)
        location on the axis where the table will be drawn
        can be in data units or axes units [0-1]
    loc_units : {'axes' (default), 'data'}
        specifies whether loc has 'data' units or 'axes' units [0-1]
    **kwargs
        passed to `ax.text` for both text boxes.
        If ha/horizontalalignment is 'right', the value column is
        anchored at loc and the labels are placed to its left.

    Returns
    -------
    text1, text2 : matplotlib text object
        Artist for the two text boxes

    Raises
    ------
    ValueError
        if loc_units is not 'data', 'axes', or 'axis' (case insensitive)
    '''
    # List of variables
    variables = self._expand_variables(variables)

    if floatformat_fiteqn is None:
        floatformat_fiteqn = floatformat

    # Default location in lower right corner
    if loc is None:
        loc = (0.8,0.05)

    # Coordinate transform matching the units of loc
    units = loc_units.lower()
    if units=='data':
        coord = ax.transData
    elif units in ['axes','axis']:
        coord = ax.transAxes
    else:
        raise ValueError('Display units should be "Data" or "Axes"')

    # Get a dict containing the needed variables
    summarydict = self.summary_dict( variables, fitline_kw, floatformat_fiteqn )

    # One text column for labels and one for values
    label_text = '\n'.join( _texify_name(key) for key in summarydict )
    value_text = '\n'.join( _number2str(v,intformat,floatformat)
                            for v in summarydict.values() )

    # Horizontal alignment keyword, if used; the long name takes precedence
    ha = kwargs.get('horizontalalignment', kwargs.get('ha', ''))

    # For right alignment, anchor the values and grow leftward;
    # otherwise anchor the labels and grow rightward
    if ha=='right':
        first_text, second_text, sign = value_text, label_text, -1
    else:
        first_text, second_text, sign = label_text, value_text, +1

    # Add first column of text
    t1 = ax.text( loc[0], loc[1],
                  first_text,
                  transform=coord,
                  **kwargs )

    # Width of first column, converted to the units of loc
    bbox = t1.get_window_extent().transformed(coord.inverted())
    width = bbox.x1-bbox.x0

    # Add second column of text, offset by the width of the first
    t2 = ax.text( loc[0]+width*sign, loc[1],
                  second_text,
                  transform=coord,
                  **kwargs )

    return [t1,t2]
Display bivariate statistics as a table inset on a plot axis
Parameters
- ax (matplotlib.Figure.Axis): axis where the table will be displayed
- variables (list or str, default='common'):
names of attribute variables to include in summary
names are case insensitive
The following strings are also accepted in place of a list: "all" (displays all variables), "common" (displays a short list of commonly used statistics)
- fitline_kw (dict, default=None):
keywords passed to
fitline
- intformat (str, default='{:d}'): format specifier for integer values
- floatformat (str, default='{:.3f}'): format specifier for floating point values
- floatformat_fiteqn (str, default=floatformat): format specifier for slope and intercept (a,b) in y = a x + b
- loc (tuple (x0,y0), default=(0.8, 0.05)): location on the axis where the table will be drawn; can be in data units or axes units [0-1]
- loc_units ({'axes' (default), 'data'}): specifies whether loc has 'data' units or 'axes' units [0-1]
Returns
- text1, text2 (matplotlib text object): Artist for the two text boxes
def nmb( x0, x1 ):
    '''Compute Normalized Mean Bias (NMB)

    NMB = ( mean(x1) - mean(x0) ) / mean(x0)

    Parameters
    ----------
    x0 : array_like
        reference values
    x1 : array_like
        experiment values

    Returns
    -------
    float
        normalized mean bias

    Raises
    ------
    ValueError
        if x0 and x1 have different lengths
    '''

    # Explicit check (not assert) so validation survives `python -O`
    if len(x0) != len(x1):
        raise ValueError( "Parameters x0 and x1 must have the same length" )

    # Mean values
    x0_mean = np.mean(x0)
    x1_mean = np.mean(x1)

    # Metric value; algebraically equal to (x1_mean - x0_mean) / x0_mean
    return x1_mean / x0_mean - 1
Compute Normalized Mean Bias (NMB)
NMB = ( mean(x1) - mean(x0) ) / mean(x0)
Parameters
- x0 (array_like): reference values
- x1 (array_like): experiment values
def nmae( x0, x1 ):
    '''Compute Normalized Mean Absolute Error (NMAE)

    NMAE = mean(abs(x1 - x0)) / abs(mean(x0))

    Parameters
    ----------
    x0 : array_like
        reference values
    x1 : array_like
        experiment values

    Returns
    -------
    float
        normalized mean absolute error
    '''

    # Mean values
    x0_mean = np.mean(x0)

    # Mean absolute difference
    # np.subtract accepts any array_like (e.g. lists), unlike the `-` operator
    abs_diff = np.mean( np.abs( np.subtract(x1, x0) ) )

    # Metric value
    return abs_diff / np.abs( x0_mean )
Compute Normalized Mean Absolute Error (NMAE)
NMAE = mean(abs(x1 - x0)) / abs(mean(x0))
Parameters
- x0 (array_like): reference values
- x1 (array_like): experiment values
def nmbf( x0, x1 ):
    '''Compute Normalized Mean Bias Factor (NMBF)

    Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125

    NMBF = mean(x1) / mean(x0) - 1   if mean(x1) >= mean(x0)
    NMBF = 1 - mean(x0) / mean(x1)   otherwise

    Parameters
    ----------
    x0 : array_like
        reference values
    x1 : array_like
        experiment values

    Returns
    -------
    float
        normalized mean bias factor

    Raises
    ------
    ValueError
        if x0 and x1 have different lengths
    '''

    # Explicit check (not assert) so validation survives `python -O`
    if len(x0) != len(x1):
        raise ValueError( "Parameters x0 and x1 must have the same length" )

    # Mean values
    x0_mean = np.mean(x0)
    x1_mean = np.mean(x1)

    # Metric value: the smaller mean is always the denominator
    if x1_mean >= x0_mean:
        result = x1_mean / x0_mean - 1
    else:
        result = 1 - x0_mean / x1_mean

    return result
Compute Normalized Mean Bias Factor (NMBF)
Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125
Parameters
- x0 (array_like): reference values
- x1 (array_like): experiment values
def nmaef( x0, x1 ):
    '''Compute Normalized Mean Absolute Error Factor (NMAEF)

    Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125

    NMAEF = mean(abs(x1 - x0)) / mean(x0)   if mean(x1) >= mean(x0)
    NMAEF = mean(abs(x1 - x0)) / mean(x1)   otherwise

    Parameters
    ----------
    x0 : array_like
        reference values
    x1 : array_like
        experiment values

    Returns
    -------
    float
        normalized mean absolute error factor

    Raises
    ------
    ValueError
        if x0 and x1 have different lengths
    '''

    # Explicit check (not assert) so validation survives `python -O`
    if len(x0) != len(x1):
        raise ValueError( "Parameters x0 and x1 must have the same length" )

    # Mean values
    x0_mean = np.mean(x0)
    x1_mean = np.mean(x1)

    # Mean absolute difference
    # np.subtract accepts any array_like (e.g. lists), unlike the `-` operator
    abs_diff = np.mean( np.abs( np.subtract(x1, x0) ) )

    # Metric value: the smaller mean is always the denominator
    if x1_mean >= x0_mean:
        result = abs_diff / x0_mean
    else:
        result = abs_diff / x1_mean

    return result
Compute Normalized Mean Absolute Error Factor (NMAEF)
Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125
Parameters
- x0 (array_like): reference values
- x1 (array_like): experiment values