acgc.stats.bivariate
Bivariate statistics
Statistical measures of relationships between two populations
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" Bivariate statistics

Statistical measures of relationships between two populations
"""

import numpy as np
from scipy import stats
from .bivariate_lines import sen, sma, bivariate_line_equation
# import xarray as xr

__all__ = [
    "BivariateStatistics",
    "nmb",
    "nmae",
    "nmbf",
    "nmaef"
]

def nmb( x0, x1 ):
    '''Compute Normalized Mean Bias (NMB)

    NMB = ( mean(x1) - mean(x0) ) / mean(x0)

    Parameters
    ----------
    x0 : array_like
        reference values
    x1 : array_like
        experiment values

    Returns
    -------
    float
        mean(x1) / mean(x0) - 1
    '''

    assert (len(x0) == len(x1)), \
        "Parameters x0 and x1 must have the same length"

    # Mean values
    x0_mean = np.mean(x0)
    x1_mean = np.mean(x1)

    # Metric value
    return x1_mean / x0_mean - 1

def nmae( x0, x1 ):
    '''Compute Normalized Mean Absolute Error (NMAE)

    NMAE = mean(abs(x1 - x0)) / abs(mean(x0))

    Parameters
    ----------
    x0 : array_like
        reference values
    x1 : array_like
        experiment values

    Returns
    -------
    float
        mean absolute difference divided by the absolute mean of x0
    '''

    # Mean values
    x0_mean = np.mean(x0)

    # Mean absolute difference
    abs_diff = np.mean( np.abs(x1 - x0) )

    # Metric value
    return abs_diff / np.abs( x0_mean )


def nmbf( x0, x1 ):
    '''Compute Normalized Mean Bias Factor (NMBF)

    Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125

    NMBF = mean(x1) / mean(x0) - 1   if mean(x1) >= mean(x0)
    NMBF = 1 - mean(x0) / mean(x1)   otherwise

    Parameters
    ----------
    x0 : array_like
        reference values
    x1 : array_like
        experiment values

    Returns
    -------
    float
        normalized mean bias factor
    '''

    # Ensure that arguments have the same length
    assert (len(x0) == len(x1)), \
        "Parameters x0 and x1 must have the same length"

    # Mean values
    x0_mean = np.mean(x0)
    x1_mean = np.mean(x1)

    # Metric value; the branch keeps the factor symmetric about zero
    if x1_mean >= x0_mean:
        result = x1_mean / x0_mean - 1
    else:
        result = 1 - x0_mean / x1_mean

    return result

def nmaef( x0, x1 ):
    '''Compute Normalized Mean Absolute Error Factor (NMAEF)

    Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125

    NMAEF = mean(abs(x1 - x0)) / mean(x0)   if mean(x1) >= mean(x0)
    NMAEF = mean(abs(x1 - x0)) / mean(x1)   otherwise

    Parameters
    ----------
    x0 : array_like
        reference values
    x1 : array_like
        experiment values

    Returns
    -------
    float
        normalized mean absolute error factor
    '''

    # Ensure that arguments have the same length
    assert (len(x0) == len(x1)), \
        "Parameters x0 and x1 must have the same length"

    # Mean values
    x0_mean = np.mean(x0)
    x1_mean = np.mean(x1)

    # Mean absolute difference
    abs_diff = np.mean( np.abs(x1 - x0) )

    # Metric value; normalize by whichever mean is smaller
    if x1_mean >= x0_mean:
        result = abs_diff / x0_mean
    else:
        result = abs_diff / x1_mean

    return result

def _texify_name(name):
    '''Return a LaTeX formatted string for some variables

    'R2' and 'r2' are matched exactly; 'y_ols', 'y_sma', and 'y_sen' are
    matched without regard to case. Any other name is returned unchanged.

    Parameters
    ----------
    name : str
        variable name

    Returns
    -------
    pretty_name : str
        LaTeX string for recognized names, otherwise `name`
    '''
    # Case matters here: R2 and r2 are rendered differently
    if name == 'R2':
        return '$R^2$'
    if name == 'r2':
        return '$r^2$'

    fit_labels = {'y_ols': r'$y_{\rm OLS}$',
                  'y_sma': r'$y_{\rm SMA}$',
                  'y_sen': r'$y_{\rm Sen}$'}
    return fit_labels.get(name.lower(), name)

class BivariateStatistics:
    '''A suite of common statistics to quantify bivariate relationships

    Class method 'summary' provides a formatted summary of these statistics

    Attributes
    ----------
    xmean, ymean : float
        mean of x and y variables
    xmedian, ymedian : float
        median of x and y variables
    xstd, ystd : float
        standard deviation of x and y variables
    mean_difference, md : float
        ymean - xmean
    mean_absolute_difference, mad : float
        mean( |y-x| )
    relative_mean_difference, rmd : float
        md / xmean
    relative_mean_absolute_difference, rmad : float
        mad / xmean
    standardized_mean_difference, smd : float
        md / xstd
    standardized_mean_absolute_difference, smad : float
        mad / xstd
    mean_relative_difference, mrd : float
        mean(y/x) - 1
    mean_log10_ratio, mlr : float
        mean( log10(y/x) )
    mean_absolute_log10_ratio, malr : float
        mean( abs( log10(y/x) ) )
    median_difference, medd : float
        median(y-x)
    median_absolute_difference, medad : float
        median(|y-x|)
    relative_median_difference, rmedd : float
        median(y-x) / xmedian
    relative_median_absolute_difference, rmedad : float
        median(|y-x|) / xmedian
    median_relative_difference, medianrd, medrd : float
        median(y/x)-1
    median_log10_ratio, medlr : float
        median( log10(y/x) )
    median_absolute_log10_ratio, medalr : float
        median( abs( log10(y/x) ) )
    normalized_mean_bias_factor, nmbf : float
        see `nmbf`
    normalized_mean_absolute_error_factor, nmaef : float
        see `nmaef`
    root_mean_square_difference, rmsd : float
        $\\sqrt{ \\langle (y - x)^2 \\rangle }$
    covariance : float
        cov(x,y)
    correlation_pearson, correlation, pearsonr, R, r : float
        Pearson linear correlation coefficient
    correlation_spearman, spearmanr : float
        Spearman, non-parametric rank correlation coefficient
    R2, r2 : float
        Linear coefficient of determination, $R^2$
    '''

    def __init__(self,x,y,w=None,dropna=False,data=None):
        '''Compute suite of bivariate statistics during initialization

        Statistic values are saved in attributes.
        CAUTION: Weights w are ignored except in SMA fit

        Parameters
        ----------
        x : ndarray or str
            independent variable values
        y : ndarray or str
            dependent variable values, same size as x
        w : ndarray or str, optional
            weights for points (x,y), same size as x and y
        dropna : bool, optional (default=False)
            drops NaN values from x, y, and w
        data : dict-like, optional
            if x, y, or w are str, then they should be keys in data

        Raises
        ------
        ValueError
            if x, y, or w is a string and data is None, or if the
            lengths of x, y, and w do not agree
        '''

        # Get values from data if needed
        if data is None and any( isinstance(arg,str) for arg in (x,y,w) ):
            raise ValueError( 'Data argument must be used if x, y, or w is a string')
        if isinstance(x,str):
            x = data[x]
        if isinstance(y,str):
            y = data[y]
        if isinstance(w,str):
            w = data[w]

        # Ensure that x, y, and w have the same length
        if len(x) != len(y):
            raise ValueError( 'Arguments x and y must have the same length' )
        if w is None:
            w = np.ones_like(x)
        if len(w) != len(x):
            raise ValueError( 'Argument w (if present) must have the same length as x' )

        # Drop NaN values; the product is NaN wherever any of x, y, w is NaN
        if dropna:
            isna = np.isnan(x*y*w)
            x = x[~isna]
            y = y[~isna]
            w = w[~isna]

        diff = y - x
        absdiff = np.abs( y - x )
        # Ignore divide by zero and 0/0 while dividing.
        # The context manager restores the previous settings even on error.
        with np.errstate(divide='ignore',invalid='ignore'):
            ratio = y/x
            log10ratio = np.log10(ratio)

        # Means, medians, and standard deviations
        self.xmean = np.mean(x)
        self.ymean = np.mean(y)
        self.xmedian = np.median(x)
        self.ymedian = np.median(y)
        self.xstd = np.std(x)
        self.ystd = np.std(y)

        # Save values for use later
        self._x = x
        self._y = y
        self._w = w

        # Mean and mean absolute differences
        self.mean_difference = self.md = self.ymean - self.xmean
        self.mean_absolute_difference = self.mad = np.mean( absdiff )

        # Relative and standardized differences
        self.relative_mean_difference = self.rmd = self.mean_difference / self.xmean
        self.relative_mean_absolute_difference = self.rmad = \
            self.mean_absolute_difference / self.xmean
        self.standardized_mean_difference = self.smd = self.mean_difference / self.xstd
        self.standardized_mean_absolute_difference = self.smad = \
            self.mean_absolute_difference / self.xstd

        # Mean and median relative differences
        self.mean_relative_difference = self.mrd = np.mean( ratio - 1 )
        self.median_relative_difference = self.medianrd = self.medrd = np.median( ratio - 1 )

        # Median and median absolute differences
        self.median_difference = self.medd = np.median( diff )
        self.median_absolute_difference = self.medad = np.median( absdiff )

        # Relative median differences
        self.relative_median_difference = self.rmedd = self.median_difference / self.xmedian
        self.relative_median_absolute_difference = self.rmedad = \
            self.median_absolute_difference / self.xmedian

        self.normalized_mean_bias_factor = self.nmbf = nmbf(x,y)
        self.normalized_mean_absolute_error_factor = self.nmaef = nmaef(x,y)

        # Mean and mean absolute log ratio
        self.mean_log10_ratio = self.mlr = np.mean( log10ratio )
        self.mean_absolute_log10_ratio = self.malr = np.mean( np.abs( log10ratio ) )

        # Median and median absolute log ratio
        self.median_log10_ratio = self.medlr = np.median( log10ratio )
        self.median_absolute_log10_ratio = self.medalr = np.median( np.abs( log10ratio ) )

        # RMS difference
        self.root_mean_square_difference = self.rmsd = np.sqrt( np.mean( np.power( diff, 2) ) )

        # Covariance, correlation
        self.covariance = np.cov(x,y)[0][1]
        self.correlation = self.correlation_pearson = self.R = self.r = self.pearsonr = \
            np.corrcoef(x,y)[0][1]
        self.correlation_spearman = self.spearmanr = stats.spearmanr(x,y).statistic
        self.R2 = self.r2 = self.R**2

    def __getitem__(self,key):
        '''Accesses attribute values via object['key']'''
        return getattr(self,key)

    def fitline(self,method='sma',intercept=True,**kwargs):
        '''Compute bivariate line fit

        Parameters
        ----------
        method : str
            line fitting method (case insensitive): sma (default), ols,
            sen (also theil, theilsen), siegel.
            wls and York are recognized but not implemented.
        intercept : bool
            defines whether non-zero intercept should be fitted.
            Only used by sma and ols; sen and siegel always fit an intercept.
        **kwargs
            passed to `acgc.stats.sma` (e.g. robust=True) or `sen`;
            not used by ols or siegel

        Returns
        -------
        result : dict
            dictionary with keys:
            - slope (float)
                slope of fitted line
            - intercept (float)
                intercept of fitted line
            - fittedvalues (array (N,))
                values on fit line
            - residuals (array (N,))
                residual from fit line
            - method (str)
                the method argument, as given
            - fitintercept (bool)
                whether an intercept was fitted

        Raises
        ------
        NotImplementedError
            if method is wls or york
        ValueError
            if method is not recognized
        '''

        fitintercept = intercept
        method_lc = method.lower()

        if method_lc=='sma':
            fit = sma( self._x,
                       self._y,
                       self._w,
                       intercept=fitintercept,
                       **kwargs)
            slope = fit['slope']
            intercept = fit['intercept']

        elif method_lc=='ols':
            if fitintercept:
                design = np.vstack([self._x,np.ones(len(self._x))]).T
            else:
                design = np.vstack([self._x]).T
            coef = np.linalg.lstsq( design, self._y, rcond=None )[0]
            slope = coef[0]
            # Without an intercept column the solution has only one element,
            # so the line passes through the origin
            intercept = coef[1] if fitintercept else 0.0

        elif method_lc in ['theil','sen','theilsen']:
            fitintercept = True
            fit = sen( self._x,
                       self._y,
                       **kwargs)
            slope = fit.slope
            intercept = fit.intercept

        elif method_lc=='siegel':
            fitintercept = True
            siegel = stats.siegelslopes( self._x,
                                         self._y )
            slope = siegel.slope
            intercept = siegel.intercept

        elif method_lc=='wls':
            raise NotImplementedError('WLS regression not implemented yet')

        elif method_lc=='york':
            raise NotImplementedError('York regression not implemented yet')

        else:
            raise ValueError('Undefined method '+method)

        fitted = slope * self._x + intercept
        line = dict( slope = slope,
                     intercept = intercept,
                     fittedvalues = fitted,
                     residuals = self._y - fitted,
                     method = method,
                     fitintercept = fitintercept )

        return line

    def slope(self,method='sma',intercept=True,**kwargs):
        '''Compute slope of bivariate line fit

        Parameters
        ----------
        method : str
            line fitting method, see `fitline` (default: sma)
        intercept : bool
            defines whether non-zero intercept should be fitted
        **kwargs
            passed to `fitline`

        Returns
        -------
        slope : float
            slope of the fitted line
        '''
        return self.fitline(method,intercept,**kwargs)['slope']

    def intercept(self,method='sma',intercept=True,**kwargs):
        '''Compute intercept of bivariate line fit

        Parameters
        ----------
        method : str
            line fitting method, see `fitline` (default: sma)
        intercept : bool
            defines whether non-zero intercept should be fitted
        **kwargs
            passed to `fitline`

        Returns
        -------
        intercept : float
            value of y intercept
        '''
        return self.fitline(method,intercept,**kwargs)['intercept']

    def _expand_variables(self,variables):
        '''Expand special strings into a list of variables

        Parameters
        ----------
        variables : list or str, default='common'
            Special strings ("all","common") will be expanded to a list of variables
            list arguments will not be modified
            None is treated as "common"

        Returns
        -------
        list
            variable names

        Raises
        ------
        ValueError
            if variables is not a list, None, "all", or "common"
        '''
        if variables is None:
            variables='common'
        if variables=='all':
            variables=['MD','MAD','RMD','RMAD','MRD','SMD','SMAD',
                       'MedD','MedAD','RMedD','RMedAD','MedRD',
                       'NMBF','NMAEF','RMSD',
                       'R','R2','spearmanr','slope','intercept',
                       'fitline']
        elif variables=='common':
            variables=['MD','MAD','RMD','RMAD','MRD','R2','slope']
        if not isinstance(variables,list):
            raise ValueError(
                'variables must be a list, None, or one of these strings: "all","common"')

        return variables

    def summary_dict(self, variables=None,
                     fitline_kw=None,
                     floatformat_fiteqn='{:.3f}' ):
        '''Summarize bivariate statistics into a dict

        Parameters
        ----------
        variables : list or str, default='common'
            names of attribute variables to include in summary
            names are case insensitive
            The following strings are also accepted in place of a list
            "all" (displays all variables)
            "common" (MD, MAD, RMD, RMAD, MRD, R2, slope)
        fitline_kw : dict, default=None
            keywords passed to self.fitline()
            default (None) uses method='sma', intercept=True
        floatformat_fiteqn : str, default='{:.3f}'
            format specifier passed to `bivariate_line_equation`
            when "fitline" is requested

        Returns
        -------
        summary : dict
            names and values of variables
            For "fitline", the key and value are both provided by
            `bivariate_line_equation`
        '''

        # List of variables
        variables = self._expand_variables(variables)

        if fitline_kw is None:
            fitline_kw = {'method':'sma',
                          'intercept':True}

        # Construct the dict
        summary = {}
        for v in variables:
            # Attribute and method names are all lower case or have
            # lower-case aliases, so match on the lower-case name
            name = v.lower()
            if name in ('slope','intercept'):
                # These variables are object methods
                value = getattr(self,name)(**fitline_kw)
            elif name == 'fitline':
                line = self.fitline(**fitline_kw)
                v,value = bivariate_line_equation(line,floatformat_fiteqn,ystring='separate')
            else:
                # Retrieve values
                value = getattr(self,name)

            summary[v] = value

        return summary

    def summary(self, variables=None, fitline_kw=None,
                floatformat='{:.4f}', floatformat_fiteqn=None,
                stringlength=None ):
        '''Summarize bivariate statistics

        Parameters
        ----------
        variables : list or str, default='common'
            names of attribute variables to include in summary
            names are case insensitive
            The following strings are also accepted in place of a list
            "all" (displays all variables)
            "common" (MD, MAD, RMD, RMAD, MRD, R2, slope)
        fitline_kw : dict, default=None
            keywords passed to `fitline`
        floatformat : str, default='{:.4f}'
            format specifier for floating point values
        floatformat_fiteqn : str, default=floatformat
            format specifier for slope and intercept (a,b) in y = a x + b
        stringlength : int, default=None
            length of the variables on output
            default (None) is to use the length of the longest variable name

        Returns
        -------
        summary : str
            names and values of variables, one "name = value" per line
        '''
        # List of variables
        variables = self._expand_variables(variables)

        if floatformat_fiteqn is None:
            floatformat_fiteqn = floatformat
        if stringlength is None:
            # default=0 keeps an empty variable list from raising
            stringlength = max( (len(v) for v in variables), default=0 )
        stringformat = '{:'+str(stringlength)+'s}'

        # Get a dict containing the needed variables
        summarydict = self.summary_dict( variables, fitline_kw, floatformat_fiteqn )

        # String values (e.g. fit equation) are already formatted
        lines = []
        for k,v in summarydict.items():
            if isinstance(v,str):
                lines.append( (stringformat+' = {:s}\n').format(k,v) )
            else:
                lines.append( (stringformat+' = '+floatformat+'\n').format(k,v) )

        return ''.join(lines)

    def summary_fig_inset(self, ax, variables=None, fitline_kw=None,
                          floatformat='{:.3f}', floatformat_fiteqn=None,
                          loc=None, loc_units='axes',
                          **kwargs):
        '''Display bivariate statistics as a table inset on a plot axis

        The table is drawn as two adjacent text boxes, one with the
        variable names and one with the values.

        Parameters
        ----------
        ax : matplotlib.axes.Axes
            axis where the table will be displayed
        variables : list or str, default='common'
            names of attribute variables to include in summary
            names are case insensitive
            The following strings are also accepted in place of a list
            "all" (displays all variables)
            "common" (MD, MAD, RMD, RMAD, MRD, R2, slope)
        fitline_kw : dict, default=None
            keywords passed to `fitline`
        floatformat : str, default='{:.3f}'
            format specifier for floating point values
        floatformat_fiteqn : str, default=floatformat
            format specifier for slope and intercept (a,b) in y = a x + b
        loc : tuple (x0,y0), default=(0.8, 0.05)
            location on the axis where the table will be drawn
            can be in data units or axes units [0-1]
        loc_units : {'axes' (default), 'data'}
            specifies whether loc has 'data' units or 'axes' units [0-1]
        **kwargs
            passed to `ax.text` for both text boxes
            If ha or horizontalalignment is 'right', the values are placed
            at loc and the labels to their left

        Returns
        -------
        text1, text2 : matplotlib text object
            Artist for the two text boxes

        Raises
        ------
        ValueError
            if loc_units is not 'data', 'axes', or 'axis'
        '''
        # List of variables
        variables = self._expand_variables(variables)

        if floatformat_fiteqn is None:
            floatformat_fiteqn = floatformat

        # Default location in lower right corner
        if loc is None:
            loc = (0.8,0.05)

        # Coordinates for loc
        if loc_units.lower()=='data':
            coord=ax.transData
        elif loc_units.lower() in ['axes','axis']:
            coord=ax.transAxes
        else:
            raise ValueError('Display units should be "Data" or "Axes"')

        # Get a dict containing the needed variables
        summarydict = self.summary_dict( variables, fitline_kw, floatformat_fiteqn )

        # Column of label text
        label_text = '\n'.join([_texify_name(key) for key in summarydict])
        # Column of value text
        value_text = '\n'.join([value if isinstance(value,str) else floatformat.format(value)
                                for value in summarydict.values()])

        # Check if horizontal alignment keyword is used
        # 'horizontalalignment' takes precedence over 'ha' if both are given
        ha = kwargs.get('horizontalalignment', kwargs.get('ha',''))

        # For right alignment, align on values first
        # Otherwise, align on labels
        if ha=='right':
            first_text = value_text
            second_text = label_text
            sign = -1
        else:
            first_text = label_text
            second_text = value_text
            sign = +1

        # Add first column of text
        t1=ax.text(loc[0],loc[1],
                   first_text,
                   transform=coord,
                   **kwargs
                   )

        # Get width of first text column, converted to the units of loc
        bbox = t1.get_window_extent().transformed(coord.inverted())
        width = bbox.x1-bbox.x0

        # Add second column of text, offset by the width of the first
        t2 = ax.text(loc[0]+width*sign,loc[1],
                     second_text,
                     transform=coord,
                     **kwargs
                     )

        return [t1,t2]
160class BivariateStatistics: 161 '''A suite of common statistics to quantify bivariate relationships 162 163 Class method 'summary' provides a formatted summary of these statistics 164 165 Attributes 166 ---------- 167 xmean, ymean : float 168 mean of x and y variables 169 xmedian, ymedian :float 170 median of x and y variables 171 xstd, ystd : float 172 standard deviation of x and y variables 173 mean_difference, md : float 174 ymean - xmean 175 mean_absolute_difference, mad : float 176 mean( |y-x| ) 177 relative_mean_difference, rmd : float 178 md / xmean 179 relative_mean_absolute_difference, rmad :float 180 mad / xmean 181 standardized_mean_difference, smd : float 182 md / xstd 183 standardized_mean_absolute_difference, smad : float 184 mad /xstd 185 mean_relative_difference, mrd : float 186 mean(y/x) - 1 187 mean_log10_ratio, mlr : float 188 mean( log10(y/x) ) 189 mean_absolute_log10_ratio, malr : float 190 mean( abs( log10(y/x) ) ) 191 median_difference, medd : float 192 median(y-x) 193 median_absolute_difference, medad : float 194 median(|y-x|) 195 relative_median_difference, rmedd : float 196 median(y-x) / xmedian 197 relative_median_absolute_difference, rmedad : float 198 median(|y-x|) / xmedian 199 median_relative_difference, medianrd, medrd : float 200 median(y/x)-1 201 median_log10_ratio, medlr : float 202 median( log10(y/x) ) 203 median_absolute_log10_ratio, medalr : float 204 median( abs( log10(y/x) ) ) 205 normalized_mean_bias_factor, nmbf : float 206 see `nmbf` 207 normalized_mean_absolute_error_factor, nmaef : float 208 see `nmaef` 209 root_mean_square_difference, rmsd : float 210 $\\sqrt{ \\langle (y - x)^2 \\rangle }$ 211 covariance : float 212 cov(x,y) 213 correlation_pearson, correlation, pearsonr, R, r : float 214 Pearson linear correlation coefficient 215 correlation_spearman, spearmanr : float 216 Spearman, non-parametric rank correlation coefficient 217 R2, r2 : float 218 Linear coefficient of determination, $R^2$ 219 ''' 220 221 def 
__init__(self,x,y,w=None,dropna=False,data=None): 222 '''Compute suite of bivariate statistics during initialization 223 224 Statistic values are saved in attributes. 225 CAUTION: Weights w are ignored except in SMA fit 226 227 Parameters 228 ---------- 229 x : ndarray or str 230 independent variable values 231 y : ndarray or str 232 dependent variable values, same size as x 233 w : ndarray or str, optional 234 weights for points (x,y), same size as x and y 235 dropna : bool, optional (default=False) 236 drops NaN values from x, y, and w 237 data : dict-like, optional 238 if x, y, or w are str, then they should be keys in data 239 ''' 240 241 # Get values from data if needed 242 if data is None and (isinstance(x,str) or isinstance(y,str) or isinstance(w,str)): 243 raise ValueError( 'Data argument must be used if x, y, or w is a string') 244 if isinstance(x,str): 245 x = data[x] 246 if isinstance(y,str): 247 y = data[y] 248 if isinstance(w,str): 249 w = data[w] 250 251 #Ensure that x and y have same length 252 if len(x) != len(y): 253 raise ValueError( 'Arguments x and y must have the same length' ) 254 if w is None: 255 w = np.ones_like(x) 256 if len(w) != len(x): 257 raise ValueError( 'Argument w (if present) must have the same length as x' ) 258 259 # Drop NaN values 260 if dropna: 261 isna = np.isnan(x*y*w) 262 x = x[~isna] 263 y = y[~isna] 264 w = w[~isna] 265 266 diff = y - x 267 absdiff = np.abs( y - x ) 268 # Ignore divide by zero and 0/0 while dividing 269 old_settings = np.seterr(divide='ignore',invalid='ignore') 270 ratio = y/x 271 log10ratio = np.log10(ratio) 272 np.seterr(**old_settings) 273 274 # Means, medians, and standard deviations 275 self.xmean = np.mean(x) 276 self.ymean = np.mean(y) 277 self.xmedian = np.median(x) 278 self.ymedian = np.median(y) 279 self.xstd = np.std(x) 280 self.ystd = np.std(y) 281 282 # Save values for use later 283 self._x = x 284 self._y = y 285 self._w = w 286 287 # Mean and mean absolute differences 288 
self.mean_difference = self.md = self.ymean - self.xmean 289 self.mean_absolute_difference = self.mad = np.mean( absdiff ) 290 291 # Relative and standardized differences 292 self.relative_mean_difference = self.rmd = self.mean_difference / self.xmean 293 self.relative_mean_absolute_difference = self.rmad = self.mean_absolute_difference / self.xmean 294 self.standardized_mean_difference = self.smd = self.mean_difference / self.xstd 295 self.standardized_mean_absolute_difference = self.smad = self.mean_absolute_difference / self.xstd 296 297 # Mean and median relative differences 298 self.mean_relative_difference = self.mrd = np.mean( ratio - 1 ) 299 self.median_relative_difference = self.medianrd = self.medrd = np.median( ratio - 1 ) 300 301 # Median and median absolute differences 302 self.median_difference = self.medd = np.median( diff ) 303 self.median_absolute_difference = self.medad = np.median( absdiff ) 304 305 # Relative median differences 306 self.relative_median_difference = self.rmedd = self.median_difference / self.xmedian 307 self.relative_median_absolute_difference = self.rmedad = self.median_absolute_difference / self.xmedian 308 309 self.normalized_mean_bias_factor = self.nmbf = nmbf(x,y) 310 self.normalized_mean_absolute_error_factor = self.nmaef = nmaef(x,y) 311 312 # Mean and mean absolute log ratio 313 self.mean_log10_ratio = self.mlr = np.mean( log10ratio ) 314 self.mean_absolute_log10_ratio = self.malr = np.mean( np.abs( log10ratio ) ) 315 316 # Median and median absolute log ratio 317 self.median_log10_ratio = self.medlr = np.median( log10ratio ) 318 self.median_absolute_log10_ratio = self.medalr = np.median( np.abs( log10ratio ) ) 319 320 # RMS difference 321 self.root_mean_square_difference = self.rmsd = np.sqrt( np.mean( np.power( diff, 2) ) ) 322 323 # Covariance, correlation 324 self.covariance = np.cov(x,y)[0][1] 325 self.correlation = self.correlation_pearson = self.R = self.r = self.pearsonr = \ 326 np.corrcoef(x,y)[0][1] 327 
self.correlation_spearman = self.spearmanr = stats.spearmanr(x,y).statistic 328 self.R2 = self.r2 = self.R**2 329 330 def __getitem__(self,key): 331 '''Accesses attribute values via object['key']''' 332 return getattr(self,key) 333 334 def fitline(self,method='sma',intercept=True,**kwargs): 335 '''Compute bivariate line fit 336 337 Parameters 338 ---------- 339 method : str 340 line fitting method: sma (default), ols, wls, York, sen, siegel 341 intercept : bool 342 defines whether non-zero intercept should be fitted 343 **kwargs 344 passed to `acgc.stats.sma` (e.g. robust=True) 345 346 Returns 347 ------- 348 result : dict 349 dictionary with keys: 350 - slope (float) 351 slope of fitted line 352 - intercept (float) 353 intercept of fitted line 354 - fittedvalues (array (N,)) 355 values on fit line 356 - residuals (array (N,)) 357 residual from fit line 358 ''' 359 360 fitintercept = intercept 361 362 if method.lower()=='sma': 363 fit = sma( self._x, 364 self._y, 365 self._w, 366 intercept=fitintercept, 367 **kwargs) 368 slope = fit['slope'] 369 intercept= fit['intercept'] 370 371 elif method.lower()=='ols': 372 if fitintercept: 373 ols = np.linalg.lstsq( np.vstack([self._x,np.ones(len(self._x))]).T, 374 self._y, rcond=None ) 375 else: 376 ols = np.linalg.lstsq( np.vstack([self._x]).T, self._y, rcond=None ) 377 slope = ols[0][0] 378 intercept = ols[0][1] 379 380 elif method.lower() in ['theil','sen','theilsen']: 381 fitintercept = True 382 fit = sen( self._x, 383 self._y, 384 **kwargs) 385 slope = fit.slope 386 intercept = fit.intercept 387 388 elif method.lower()=='siegel': 389 fitintercept = True 390 siegel = stats.siegelslopes( self._x, 391 self._y ) 392 slope = siegel.slope 393 intercept = siegel.intercept 394 395 elif method.lower()=='wls': 396 raise NotImplementedError('WLS regression not implemented yet') 397 398 elif method.lower()=='york': 399 raise NotImplementedError('York regression not implemented yet') 400 401 else: 402 raise ValueError('Undefined 
method '+method) 403 404 line = dict( slope = slope, 405 intercept = intercept, 406 fittedvalues = slope * self._x + intercept, 407 residuals = self._y - ( slope * self._x + intercept ), 408 method = method, 409 fitintercept = fitintercept ) 410 411 return line 412 413 def slope(self,method='sma',intercept=True,**kwargs): 414 '''Compute slope of bivariate line fit 415 416 Parameters 417 ---------- 418 method : str 419 line fitting method: sma (default), ols, wls 420 intercept : bool 421 defines whether non-zero intercept should be fitted 422 **kwargs 423 passed to `fitline` 424 425 Returns 426 ------- 427 slope : float 428 value of y intercept 429 ''' 430 return self.fitline(method,intercept,**kwargs)['slope'] 431 432 def intercept(self,method='sma',intercept=True,**kwargs): 433 '''Compute intercept of bivariate line fit 434 435 Parameters 436 ---------- 437 method : str 438 line fitting method: sma (default) or ols 439 intercept : bool 440 defines whether non-zero intercept should be fitted 441 **kwargs 442 passed to `fitline` 443 444 Returns 445 ------- 446 intercept : float 447 value of y intercept 448 ''' 449 return self.fitline(method,intercept,**kwargs)['intercept'] 450 451 def _expand_variables(self,variables): 452 '''Expand special strings into a list of variables 453 454 Parameter 455 --------- 456 variables : list or str, default='common' 457 Special strings ("all","common") will be expanded to a list of variables 458 list arguments will not be modified 459 460 Returns 461 ------- 462 list 463 variable names 464 ''' 465 if variables is None: 466 variables='common' 467 if variables=='all': 468 variables=['MD','MAD','RMD','RMAD','MRD','SMD','SMAD', 469 'MedD','MedAD','RMedD','RMedAD','MedRD', 470 'NMBF','NMAEF','RMSD', 471 'R','R2','spearmanr','slope','intercept', 472 'fitline'] 473 elif variables=='common': 474 variables=['MD','MAD','RMD','RMAD','MRD','R2','slope'] 475 if not isinstance(variables,list): 476 raise ValueError( 477 'variables must be a list, 
None, or one of these strings: "all","common"') 478 479 return variables 480 481 def summary_dict(self, variables=None, 482 fitline_kw=None, 483 floatformat_fiteqn='{:.3f}' ): 484 '''Summarize bivariate statistics into a dict 485 486 Parameters 487 ---------- 488 vars : list or str, default='common' 489 names of attribute variables to include in summary 490 names are case insensitive 491 The following strings are also accepted in place of a list 492 "all" (displays all variables) 493 "common" (displays all measures of mean difference) 494 fitline_kw : dict, default=None) 495 keywords passed to self.fitline() 496 497 Returns 498 ------- 499 summary : dict 500 names and values of variables 501 ''' 502 503 # List of variables 504 variables = self._expand_variables(variables) 505 506 if fitline_kw is None: 507 fitline_kw = {'method':'sma', 508 'intercept':True} 509 510 # Construct the dict 511 summary = {} 512 for v in variables: 513 if v in ['slope','intercept']: 514 # These variables are object methods 515 func = getattr(self,v) 516 value = func(**fitline_kw) 517 elif v == 'fitline': 518 line = self.fitline(**fitline_kw) 519 v,value = bivariate_line_equation(line,floatformat_fiteqn,ystring='separate') 520 else: 521 # Retrieve values 522 value = getattr(self,v.lower()) 523 524 # summary += (stringformat+'='+floatformat+'\n').format(v,value) 525 summary[v] = value 526 527 return summary 528 529 def summary(self, variables=None, fitline_kw=None, 530 floatformat='{:.4f}', floatformat_fiteqn=None, 531 stringlength=None ): 532 '''Summarize bivariate statistics 533 534 Parameters 535 ---------- 536 vars : list or str, default='common' 537 names of attribute variables to include in summary 538 names are case insensitive 539 The following strings are also accepted in place of a list 540 "all" (displays all variables) 541 "common" (displays all measures of mean difference) 542 floatformat : str, default='{:.4f}' 543 format specifier for floating point values 544 
floatformat_fiteqn : str, default=floatformat 545 format specifier for slope and intercept (a,b) in y = a x + b 546 stringlength : int, default=None 547 length of the variables on output 548 default (None) is to use the length of the longest variable name 549 fitline_kw : dict, default=None 550 keywords passed to `fitline` 551 552 Returns 553 ------- 554 summary : str 555 names and values of variables 556 ''' 557 # List of variables 558 variables = self._expand_variables(variables) 559 560 if floatformat_fiteqn is None: 561 floatformat_fiteqn = floatformat 562 if stringlength is None: 563 stringlength = np.max([len(v) for v in variables]) 564 stringformat = '{:'+str(stringlength)+'s}' 565 566 # Get a dict containing the needed variables 567 summarydict = self.summary_dict( variables, fitline_kw, floatformat_fiteqn ) 568 569 # Extract length of the float numbers from floatformat 570 # import re 571 # floatlength = np.floor( float( re.findall("[-+]?(?:\d*\.*\d+)", 572 # floatformat )[0] ) ).astype(int) 573 574 # summary = (stringformat+'{:>10s}').format('Variable','Value') 575 summarytext = '' 576 for k,v in summarydict.items(): 577 if isinstance(v,str): 578 summarytext += (stringformat+' = {:s}\n').format(k,v) 579 else: 580 summarytext += (stringformat+' = '+floatformat+'\n').format(k,v) 581 582 return summarytext 583 584 def summary_fig_inset(self, ax, variables=None, fitline_kw=None, 585 floatformat='{:.3f}', floatformat_fiteqn=None, 586 loc=None, loc_units='axes', 587 **kwargs): 588 '''Display bivariate statistics as a table inset on a plot axis 589 590 Parameters 591 ---------- 592 ax : matplotlib.Figure.Axis 593 axis where the table will be displayed 594 variables : list or str, default='common' 595 names of attribute variables to include in summary 596 names are case insensitive 597 The following strings are also accepted in place of a list 598 "all" (displays all variables) 599 "common" (displays all measures of mean difference) 600 fitline_kw : dict, 
default=None 601 keywords passed to `fitline` 602 floatformat : str, default='{:.3f}' 603 format specifier for floating point values 604 floatformat_fiteqn : str, default=floatformat 605 format specifier for slope and intercept (a,b) in y = a x + b 606 loc : tuple (x0,y0), default=(0.85, 0.05) 607 location on the axis where the table will be drawn 608 can be in data units or axes units [0-1] 609 loc_units : {'axes' (default), 'data'} 610 specifies whether loc has 'data' units or 'axes' units [0-1] 611 612 Returns 613 ------- 614 text1, text2 : matplotlib text object 615 Artist for the two text boxes 616 ''' 617 # List of variables 618 variables = self._expand_variables(variables) 619 620 if floatformat_fiteqn is None: 621 floatformat_fiteqn = floatformat 622 623 # Default location in lower right corner 624 if loc is None: 625 loc = (0.8,0.05) 626 627 # Coordinates for loc 628 if loc_units.lower()=='data': 629 coord=ax.transData 630 elif loc_units.lower() in ['axes','axis']: 631 coord=ax.transAxes 632 else: 633 raise ValueError('Display units should be "Data" or "Axes"') 634 635 # Get a dict containing the needed variables 636 summarydict = self.summary_dict( variables, fitline_kw, floatformat_fiteqn ) 637 638 # Column of label text 639 label_text = '\n'.join([_texify_name(key) for key in summarydict]) 640 # Column of value text 641 value_text = '\n'.join([value if isinstance(value,str) else floatformat.format(value) 642 for value in summarydict.values()]) 643 644 # Check if horizontal alignment keyword is used 645 ha='' 646 try: 647 ha = kwargs['ha'] 648 except KeyError: 649 pass 650 try: 651 ha = kwargs['horizontalalignment'] 652 except KeyError: 653 pass 654 655 # For right alignment, align on values first 656 # Otherwise, align on labels 657 if ha=='right': 658 first_text = value_text 659 second_text = label_text 660 sign = -1 661 else: 662 first_text = label_text 663 second_text = value_text 664 sign = +1 665 666 # Add first column of text 667 
t1=ax.text(loc[0],loc[1], 668 first_text, 669 transform=coord, 670 **kwargs 671 ) 672 673 # Get width of first text column 674 bbox = t1.get_window_extent().transformed(coord.inverted()) 675 width = bbox.x1-bbox.x0 676 677 # Add second column of text 678 t2 = ax.text(loc[0]+width*sign,loc[1], 679 second_text, 680 transform=coord, 681 **kwargs 682 ) 683 684 ################################## 685 # Early version of this function using matplotlib.table.table() 686 687 # if isinstance(loc,(tuple,list)): 688 # # Create an inset axis to contain the table 689 # tableaxis = ax.inset_axes(loc) 690 # table_width=1 691 # else: 692 # tableaxis = ax 693 694 # # Display the table on the axis 695 # return mtable.table( 696 # tableaxis, 697 # cellText=[[floatformat.format(value)] for value in summarydict.values()], 698 # rowLabels=[texify_name(key) for key in summarydict], 699 # colWidths=[table_width/2]*2, 700 # edges=edges, 701 # loc=loc, bbox=bbox 702 # ) 703 704 return [t1,t2]
A suite of common statistics to quantify bivariate relationships
Class method 'summary' provides a formatted summary of these statistics
Attributes
- xmean, ymean (float): mean of x and y variables
- xmedian, ymedian (float): median of x and y variables
- xstd, ystd (float): standard deviation of x and y variables
- mean_difference, md (float): ymean - xmean
- mean_absolute_difference, mad (float): mean( |y-x| )
- relative_mean_difference, rmd (float): md / xmean
- relative_mean_absolute_difference, rmad (float): mad / xmean
- standardized_mean_difference, smd (float): md / xstd
- standardized_mean_absolute_difference, smad (float): mad / xstd
- mean_relative_difference, mrd (float): mean(y/x) - 1
- mean_log10_ratio, mlr (float): mean( log10(y/x) )
- mean_absolute_log10_ratio, malr (float): mean( abs( log10(y/x) ) )
- median_difference, medd (float): median(y-x)
- median_absolute_difference, medad (float): median(|y-x|)
- relative_median_difference, rmedd (float): median(y-x) / xmedian
- relative_median_absolute_difference, rmedad (float): median(|y-x|) / xmedian
- median_relative_difference, medianrd, medrd (float): median(y/x)-1
- median_log10_ratio, medlr (float): median( log10(y/x) )
- median_absolute_log10_ratio, medalr (float): median( abs( log10(y/x) ) )
- normalized_mean_bias_factor, nmbf (float): see `nmbf`
- normalized_mean_absolute_error_factor, nmaef (float): see `nmaef`
- root_mean_square_difference, rmsd (float): $\sqrt{ \langle (y - x)^2 \rangle }$
- covariance (float): cov(x,y)
- correlation_pearson, correlation, pearsonr, R, r (float): Pearson linear correlation coefficient
- correlation_spearman, spearmanr (float): Spearman, non-parametric rank correlation coefficient
- R2, r2 (float): Linear coefficient of determination, $R^2$
def __init__(self,x,y,w=None,dropna=False,data=None):
    '''Compute suite of bivariate statistics during initialization

    Statistic values are saved in attributes.
    CAUTION: Weights w are ignored except in SMA fit

    Parameters
    ----------
    x : ndarray or str
        independent variable values
    y : ndarray or str
        dependent variable values, same size as x
    w : ndarray or str, optional
        weights for points (x,y), same size as x and y
        default (None) gives every point a weight of one
    dropna : bool, optional (default=False)
        drops NaN values from x, y, and w
    data : dict-like, optional
        if x, y, or w are str, then they should be keys in data

    Raises
    ------
    ValueError
        if x, y, or w is a str and data is None,
        or if x, y, and w do not all have the same length
    '''

    # Get values from data if needed
    if data is None and (isinstance(x,str) or isinstance(y,str) or isinstance(w,str)):
        raise ValueError( 'Data argument must be used if x, y, or w is a string')
    if isinstance(x,str):
        x = data[x]
    if isinstance(y,str):
        y = data[y]
    if isinstance(w,str):
        w = data[w]

    # Ensure that x, y, and w have same length
    if len(x) != len(y):
        raise ValueError( 'Arguments x and y must have the same length' )
    if w is None:
        w = np.ones_like(x)
    if len(w) != len(x):
        raise ValueError( 'Argument w (if present) must have the same length as x' )

    # Drop NaN values; the product is NaN wherever any of x, y, w is NaN
    if dropna:
        isna = np.isnan(x*y*w)
        x = x[~isna]
        y = y[~isna]
        w = w[~isna]

    diff = y - x
    absdiff = np.abs( y - x )
    # Ignore divide by zero and 0/0 while dividing.
    # The context manager restores the previous floating-point error
    # settings even if the division or logarithm raises.
    with np.errstate(divide='ignore',invalid='ignore'):
        ratio = y/x
        log10ratio = np.log10(ratio)

    # Means, medians, and standard deviations
    self.xmean = np.mean(x)
    self.ymean = np.mean(y)
    self.xmedian = np.median(x)
    self.ymedian = np.median(y)
    self.xstd = np.std(x)
    self.ystd = np.std(y)

    # Save values for use later (line fitting)
    self._x = x
    self._y = y
    self._w = w

    # Mean and mean absolute differences
    self.mean_difference = self.md = self.ymean - self.xmean
    self.mean_absolute_difference = self.mad = np.mean( absdiff )

    # Relative and standardized differences
    self.relative_mean_difference = self.rmd = self.mean_difference / self.xmean
    self.relative_mean_absolute_difference = self.rmad = \
        self.mean_absolute_difference / self.xmean
    self.standardized_mean_difference = self.smd = self.mean_difference / self.xstd
    self.standardized_mean_absolute_difference = self.smad = \
        self.mean_absolute_difference / self.xstd

    # Mean and median relative differences
    self.mean_relative_difference = self.mrd = np.mean( ratio - 1 )
    self.median_relative_difference = self.medianrd = self.medrd = np.median( ratio - 1 )

    # Median and median absolute differences
    self.median_difference = self.medd = np.median( diff )
    self.median_absolute_difference = self.medad = np.median( absdiff )

    # Relative median differences
    self.relative_median_difference = self.rmedd = self.median_difference / self.xmedian
    self.relative_median_absolute_difference = self.rmedad = \
        self.median_absolute_difference / self.xmedian

    # Normalized mean bias factor and absolute error factor
    self.normalized_mean_bias_factor = self.nmbf = nmbf(x,y)
    self.normalized_mean_absolute_error_factor = self.nmaef = nmaef(x,y)

    # Mean and mean absolute log ratio
    self.mean_log10_ratio = self.mlr = np.mean( log10ratio )
    self.mean_absolute_log10_ratio = self.malr = np.mean( np.abs( log10ratio ) )

    # Median and median absolute log ratio
    self.median_log10_ratio = self.medlr = np.median( log10ratio )
    self.median_absolute_log10_ratio = self.medalr = np.median( np.abs( log10ratio ) )

    # RMS difference
    self.root_mean_square_difference = self.rmsd = np.sqrt( np.mean( np.power( diff, 2) ) )

    # Covariance, correlation
    self.covariance = np.cov(x,y)[0][1]
    self.correlation = self.correlation_pearson = self.R = self.r = self.pearsonr = \
        np.corrcoef(x,y)[0][1]
    self.correlation_spearman = self.spearmanr = stats.spearmanr(x,y).statistic
    self.R2 = self.r2 = self.R**2
Compute suite of bivariate statistics during initialization
Statistic values are saved in attributes. CAUTION: Weights w are ignored except in SMA fit
Parameters
- x (ndarray or str): independent variable values
- y (ndarray or str): dependent variable values, same size as x
- w (ndarray or str, optional): weights for points (x,y), same size as x and y
- dropna (bool, optional (default=False)): drops NaN values from x, y, and w
- data (dict-like, optional): if x, y, or w are str, then they should be keys in data
def fitline(self,method='sma',intercept=True,**kwargs):
    '''Compute bivariate line fit

    Parameters
    ----------
    method : str
        line fitting method (case insensitive):
        sma (default), ols, sen (also theil, theilsen), siegel.
        wls and York are recognized but not implemented
    intercept : bool
        defines whether non-zero intercept should be fitted
        The sen and siegel methods always fit an intercept
    **kwargs
        passed to `acgc.stats.sma` (e.g. robust=True) or to `sen`;
        unused by ols and siegel

    Returns
    -------
    result : dict
        dictionary with keys:
        - slope (float)
            slope of fitted line
        - intercept (float)
            intercept of fitted line (0 for OLS through the origin)
        - fittedvalues (array (N,))
            values on fit line
        - residuals (array (N,))
            residual from fit line
        - method (str)
            method name as passed by the caller
        - fitintercept (bool)
            whether an intercept was fitted

    Raises
    ------
    NotImplementedError
        if method is wls or York
    ValueError
        if method is not recognized
    '''

    fitintercept = intercept
    method_key = method.lower()

    if method_key=='sma':
        fit = sma(  self._x,
                    self._y,
                    self._w,
                    intercept=fitintercept,
                    **kwargs)
        slope = fit['slope']
        intercept = fit['intercept']

    elif method_key=='ols':
        if fitintercept:
            # Design matrix with a column of ones for the intercept
            design = np.vstack([self._x,np.ones(len(self._x))]).T
            coeffs = np.linalg.lstsq( design, self._y, rcond=None )[0]
            slope = coeffs[0]
            intercept = coeffs[1]
        else:
            # Line through the origin: the solution holds only the slope
            coeffs = np.linalg.lstsq( np.vstack([self._x]).T, self._y, rcond=None )[0]
            slope = coeffs[0]
            intercept = 0.0

    elif method_key in ['theil','sen','theilsen']:
        fitintercept = True
        fit = sen( self._x,
                   self._y,
                   **kwargs)
        slope = fit.slope
        intercept = fit.intercept

    elif method_key=='siegel':
        fitintercept = True
        # scipy.stats.siegelslopes takes the dependent variable first: (y, x)
        slope, intercept = stats.siegelslopes( self._y,
                                               self._x )

    elif method_key=='wls':
        raise NotImplementedError('WLS regression not implemented yet')

    elif method_key=='york':
        raise NotImplementedError('York regression not implemented yet')

    else:
        raise ValueError('Undefined method '+method)

    fittedvalues = slope * self._x + intercept

    line = dict( slope        = slope,
                 intercept    = intercept,
                 fittedvalues = fittedvalues,
                 residuals    = self._y - fittedvalues,
                 method       = method,
                 fitintercept = fitintercept )

    return line
Compute bivariate line fit
Parameters
- method (str): line fitting method: sma (default), ols, wls, York, sen, siegel
- intercept (bool): defines whether non-zero intercept should be fitted
- **kwargs: passed to
acgc.stats.sma
(e.g. robust=True)
Returns
- result (dict):
dictionary with keys:
- slope (float) slope of fitted line
- intercept (float) intercept of fitted line
- fittedvalues (array (N,)) values on fit line
- residuals (array (N,)) residual from fit line
def slope(self,method='sma',intercept=True,**kwargs):
    '''Return the slope of a bivariate line fit

    Convenience wrapper that runs `fitline` and extracts the slope.

    Parameters
    ----------
    method : str
        line fitting method, passed to `fitline` (default: sma)
    intercept : bool
        defines whether non-zero intercept should be fitted
    **kwargs
        passed to `fitline`

    Returns
    -------
    slope : float
        slope of the fitted line
    '''
    line = self.fitline(method,intercept,**kwargs)
    return line['slope']
Compute slope of bivariate line fit
Parameters
- method (str): line fitting method: sma (default), ols, wls
- intercept (bool): defines whether non-zero intercept should be fitted
- **kwargs: passed to
fitline
Returns
- slope (float): slope of the fitted line
def intercept(self,method='sma',intercept=True,**kwargs):
    '''Return the intercept of a bivariate line fit

    Convenience wrapper that runs `fitline` and extracts the intercept.

    Parameters
    ----------
    method : str
        line fitting method, passed to `fitline` (default: sma)
    intercept : bool
        defines whether non-zero intercept should be fitted
    **kwargs
        passed to `fitline`

    Returns
    -------
    intercept : float
        value of y intercept
    '''
    line = self.fitline(method,intercept,**kwargs)
    return line['intercept']
Compute intercept of bivariate line fit
Parameters
- method (str): line fitting method: sma (default) or ols
- intercept (bool): defines whether non-zero intercept should be fitted
- **kwargs: passed to
fitline
Returns
- intercept (float): value of y intercept
def summary_dict(self, variables=None,
                 fitline_kw=None,
                 floatformat_fiteqn='{:.3f}' ):
    '''Summarize bivariate statistics into a dict

    Parameters
    ----------
    variables : list or str, default='common'
        names of attribute variables to include in summary
        names are case insensitive
        The following strings are also accepted in place of a list
            "all" (displays all variables)
            "common" (displays all measures of mean difference)
    fitline_kw : dict, default=None
        keywords passed to self.fitline()
        default (None) uses method='sma' and intercept=True
    floatformat_fiteqn : str, default='{:.3f}'
        format specifier passed to `bivariate_line_equation`
        when 'fitline' is requested

    Returns
    -------
    summary : dict
        names and values of variables
        Keys are the names as requested, except 'fitline', whose key
        and value both come from `bivariate_line_equation`
    '''

    # List of variables
    variables = self._expand_variables(variables)

    if fitline_kw is None:
        fitline_kw = {'method':'sma',
                      'intercept':True}

    # Construct the dict
    summary = {}
    for v in variables:
        # Match names case-insensitively, including the special names
        name = v.lower()
        if name in ('slope','intercept'):
            # These variables are object methods
            value = getattr(self,name)(**fitline_kw)
        elif name == 'fitline':
            line = self.fitline(**fitline_kw)
            # Replaces the key with the label returned for the equation
            v,value = bivariate_line_equation(line,floatformat_fiteqn,ystring='separate')
        else:
            # Retrieve values stored as attributes
            value = getattr(self,name)

        summary[v] = value

    return summary
Summarize bivariate statistics into a dict
Parameters
- variables (list or str, default='common'):
names of attribute variables to include in summary
names are case insensitive
The following strings are also accepted in place of a list: "all" (displays all variables), "common" (displays all measures of mean difference)
- fitline_kw (dict, default=None): keywords passed to self.fitline()
Returns
- summary (dict): names and values of variables
def summary(self, variables=None, fitline_kw=None,
            floatformat='{:.4f}', floatformat_fiteqn=None,
            stringlength=None ):
    '''Summarize bivariate statistics

    Parameters
    ----------
    variables : list or str, default='common'
        names of attribute variables to include in summary
        names are case insensitive
        The following strings are also accepted in place of a list
            "all" (displays all variables)
            "common" (displays all measures of mean difference)
    fitline_kw : dict, default=None
        keywords passed to `fitline`
    floatformat : str, default='{:.4f}'
        format specifier for floating point values
    floatformat_fiteqn : str, default=floatformat
        format specifier for slope and intercept (a,b) in y = a x + b
    stringlength : int, default=None
        length of the variables on output
        default (None) is to use the length of the longest variable name

    Returns
    -------
    summary : str
        names and values of variables, one "name = value" line each
        Empty string if no variables are requested
    '''
    # List of variables
    variables = self._expand_variables(variables)

    if floatformat_fiteqn is None:
        floatformat_fiteqn = floatformat
    if stringlength is None:
        # default=0 keeps an empty variable list from raising
        stringlength = max( (len(v) for v in variables), default=0 )
    stringformat = '{:'+str(stringlength)+'s}'

    # Get a dict containing the needed variables
    summarydict = self.summary_dict( variables, fitline_kw, floatformat_fiteqn )

    lines = []
    for k,v in summarydict.items():
        # String values (e.g. a fit equation) are printed verbatim
        valueformat = '{:s}' if isinstance(v,str) else floatformat
        lines.append( (stringformat+' = '+valueformat+'\n').format(k,v) )

    return ''.join(lines)
Summarize bivariate statistics
Parameters
- variables (list or str, default='common'):
names of attribute variables to include in summary
names are case insensitive
The following strings are also accepted in place of a list: "all" (displays all variables), "common" (displays all measures of mean difference)
- floatformat (str, default='{:.4f}'): format specifier for floating point values
- floatformat_fiteqn (str, default=floatformat): format specifier for slope and intercept (a,b) in y = a x + b
- stringlength (int, default=None): length of the variables on output default (None) is to use the length of the longest variable name
- fitline_kw (dict, default=None):
keywords passed to
fitline
Returns
- summary (str): names and values of variables
def summary_fig_inset(self, ax, variables=None, fitline_kw=None,
                      floatformat='{:.3f}', floatformat_fiteqn=None,
                      loc=None, loc_units='axes',
                      **kwargs):
    '''Draw bivariate statistics as two text columns on a plot axis

    One column holds the variable labels and the other the values.
    The second column is offset horizontally by the drawn width
    of the first.

    Parameters
    ----------
    ax : matplotlib.Figure.Axis
        axis where the table will be displayed
    variables : list or str, default='common'
        names of attribute variables to include in summary
        names are case insensitive
        The following strings are also accepted in place of a list
            "all" (displays all variables)
            "common" (displays all measures of mean difference)
    fitline_kw : dict, default=None
        keywords passed to `fitline`
    floatformat : str, default='{:.3f}'
        format specifier for floating point values
    floatformat_fiteqn : str, default=floatformat
        format specifier for slope and intercept (a,b) in y = a x + b
    loc : tuple (x0,y0), default=(0.8, 0.05)
        location on the axis where the table will be drawn
        can be in data units or axes units [0-1]
    loc_units : {'axes' (default), 'data'}
        specifies whether loc has 'data' units or 'axes' units [0-1]
    **kwargs
        passed to both `ax.text` calls
        If ha or horizontalalignment is 'right', the value column is
        drawn first and the label column is placed to its left

    Returns
    -------
    text1, text2 : matplotlib text object
        list of the Artists for the two text boxes, in drawing order

    Raises
    ------
    ValueError
        if loc_units is not 'data', 'axes', or 'axis'
    '''
    # List of variables
    variables = self._expand_variables(variables)

    # Fit equation uses the general float format unless told otherwise
    fiteqn_format = floatformat if floatformat_fiteqn is None else floatformat_fiteqn

    # Default location in lower right corner
    if loc is None:
        loc = (0.8,0.05)

    # Coordinate transform matching the units of loc
    units = loc_units.lower()
    if units=='data':
        coord = ax.transData
    elif units in ['axes','axis']:
        coord = ax.transAxes
    else:
        raise ValueError('Display units should be "Data" or "Axes"')

    # Get a dict containing the needed variables
    summarydict = self.summary_dict( variables, fitline_kw, fiteqn_format )

    # One line of text per variable in each column
    labels = [_texify_name(key) for key in summarydict]
    values = [value if isinstance(value,str) else floatformat.format(value)
              for value in summarydict.values()]
    label_text = '\n'.join(labels)
    value_text = '\n'.join(values)

    # Horizontal alignment keyword; the long spelling takes precedence
    ha = kwargs.get('horizontalalignment', kwargs.get('ha',''))

    # For right alignment, draw values first and labels to their left
    # Otherwise, draw labels first and values to their right
    if ha=='right':
        columns = (value_text, label_text)
        sign = -1
    else:
        columns = (label_text, value_text)
        sign = +1

    # Add first column of text
    t1 = ax.text(loc[0],loc[1],
                 columns[0],
                 transform=coord,
                 **kwargs
                 )

    # Width of first text column, converted back to the units of loc
    bbox = t1.get_window_extent().transformed(coord.inverted())
    width = bbox.x1-bbox.x0

    # Add second column of text, shifted by the first column's width
    t2 = ax.text(loc[0]+width*sign,loc[1],
                 columns[1],
                 transform=coord,
                 **kwargs
                 )

    return [t1,t2]
Display bivariate statistics as a table inset on a plot axis
Parameters
- ax (matplotlib.Figure.Axis): axis where the table will be displayed
- variables (list or str, default='common'):
names of attribute variables to include in summary
names are case insensitive
The following strings are also accepted in place of a list: "all" (displays all variables), "common" (displays all measures of mean difference)
- fitline_kw (dict, default=None): keywords passed to fitline
- floatformat (str, default='{:.3f}'): format specifier for floating point values
- floatformat_fiteqn (str, default=floatformat): format specifier for slope and intercept (a,b) in y = a x + b
- loc (tuple (x0,y0), default=(0.8, 0.05)): location on the axis where the table will be drawn; can be in data units or axes units [0-1]
- loc_units ({'axes' (default), 'data'}): specifies whether loc has 'data' units or 'axes' units [0-1]
Returns
- text1, text2 (matplotlib text object): Artist for the two text boxes
def nmb( x0, x1 ):
    '''Compute Normalized Mean Bias (NMB)

    NMB = ( mean(x1) - mean(x0) ) / mean(x0)

    Parameters
    ----------
    x0 : array_like
        reference values
    x1 : array_like
        experiment values

    Returns
    -------
    float
        ratio of the experiment mean to the reference mean, minus one
    '''

    # Both populations must be paired element by element
    assert len(x0) == len(x1), "Parameters x0 and x1 must have the same length"

    reference_mean, experiment_mean = np.mean(x0), np.mean(x1)

    return experiment_mean / reference_mean - 1
Compute Normalized Mean Bias (NMB)
NMB = ( mean(x1) - mean(x0) ) / mean(x0)
Parameters
- x0 (array_like): reference values
- x1 (array_like): experiment values
def nmae( x0, x1 ):
    '''Compute Normalized Mean Absolute Error (NMAE)

    NMAE = mean(abs(x1 - x0)) / abs(mean(x0))

    Parameters
    ----------
    x0 : array_like
        reference values
    x1 : array_like
        experiment values

    Returns
    -------
    float
        mean absolute difference divided by the absolute reference mean
    '''

    # Convert to arrays so that lists and tuples support
    # elementwise subtraction; values are paired by position
    x0 = np.asarray(x0)
    x1 = np.asarray(x1)

    # Mean values
    x0_mean = np.mean(x0)

    # Mean absolute difference
    abs_diff = np.mean( np.abs(x1 - x0) )

    # Metric value
    return abs_diff / np.abs( x0_mean )
Compute Normalized Mean Absolute Error (NMAE)
NMAE = mean(abs(x1 - x0)) / abs(mean(x0))
Parameters
- x0 (array_like): reference values
- x1 (array_like): experiment values
def nmbf( x0, x1 ):
    '''Compute Normalized Mean Bias Factor (NMBF)

    Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125

    Parameters
    ----------
    x0 : array_like
        reference values
    x1 : array_like
        experiment values

    Returns
    -------
    float
        mean(x1)/mean(x0) - 1 when mean(x1) >= mean(x0),
        otherwise 1 - mean(x0)/mean(x1)
    '''

    # Both populations must be paired element by element
    assert len(x0) == len(x1), "Parameters x0 and x1 must have the same length"

    reference_mean, experiment_mean = np.mean(x0), np.mean(x1)

    # Overestimate (or exact match): normalize by the reference mean
    if experiment_mean >= reference_mean:
        return experiment_mean / reference_mean - 1

    # Underestimate: normalize by the experiment mean
    return 1 - reference_mean / experiment_mean
Compute Normalized Mean Bias Factor (NMBF)
Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125
Parameters
- x0 (array_like): reference values
- x1 (array_like): experiment values
def nmaef( x0, x1 ):
    '''Compute Normalized Mean Absolute Error Factor (NMAEF)

    Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125

    Parameters
    ----------
    x0 : array_like
        reference values
    x1 : array_like
        experiment values

    Returns
    -------
    float
        mean(abs(x1 - x0)) divided by mean(x0) when mean(x1) >= mean(x0),
        otherwise divided by mean(x1)
    '''

    # Ensure that arguments have the same length
    assert (len(x0) == len(x1)), \
        "Parameters x0 and x1 must have the same length"

    # Convert to arrays so that lists and tuples support
    # elementwise subtraction; values are paired by position
    x0 = np.asarray(x0)
    x1 = np.asarray(x1)

    # Mean values
    x0_mean = np.mean(x0)
    x1_mean = np.mean(x1)

    # Mean absolute difference
    abs_diff = np.mean( np.abs(x1 - x0) )

    # Metric value: normalize by the smaller of the two means
    if x1_mean >= x0_mean:
        result = abs_diff / x0_mean
    else:
        result = abs_diff / x1_mean

    return result
Compute Normalized Mean Absolute Error Factor (NMAEF)
Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125
Parameters
- x0 (array_like): reference values
- x1 (array_like): experiment values