acgc.stats.bivariate
Bivariate statistics
Statistical measures of relationships between two populations
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" Bivariate statistics

Statistical measures of relationships between two populations
"""

import numpy as np
from scipy import stats
from .bivariate_lines import sen, sma, bivariate_line_equation
# import xarray as xr

__all__ = [
    "BivariateStatistics",
    "nmb",
    "nmae",
    "nmbf",
    "nmaef"
]


def nmb( x0, x1 ):
    '''Compute Normalized Mean Bias (NMB)

    NMB = ( mean(x1) - mean(x0) ) / mean(x0)

    Parameters
    ----------
    x0 : array_like
        reference values
    x1 : array_like
        experiment values

    Returns
    -------
    float
        normalized mean bias
    '''

    # Kept as an assert so callers see the same AssertionError as before
    assert (len(x0) == len(x1)), \
        "Parameters x0 and x1 must have the same length"

    # Algebraically identical to ( mean(x1) - mean(x0) ) / mean(x0)
    return np.mean(x1) / np.mean(x0) - 1


def nmae( x0, x1 ):
    '''Compute Normalized Mean Absolute Error (NMAE)

    NMAE = mean(abs(x1 - x0)) / abs(mean(x0))

    Parameters
    ----------
    x0 : array_like
        reference values
    x1 : array_like
        experiment values

    Returns
    -------
    float
        normalized mean absolute error
    '''

    # np.subtract accepts any array_like (including plain lists),
    # whereas the "-" operator fails for lists
    abs_diff = np.mean( np.abs( np.subtract(x1, x0) ) )

    return abs_diff / np.abs( np.mean(x0) )


def nmbf( x0, x1 ):
    '''Compute Normalized Mean Bias Factor (NMBF)

    Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125

    NMBF = mean(x1) / mean(x0) - 1   if mean(x1) >= mean(x0)
    NMBF = 1 - mean(x0) / mean(x1)   otherwise

    Parameters
    ----------
    x0 : array_like
        reference values
    x1 : array_like
        experiment values

    Returns
    -------
    float
        normalized mean bias factor
    '''

    # Ensure that arguments have the same length
    assert (len(x0) == len(x1)), \
        "Parameters x0 and x1 must have the same length"

    # Mean values
    x0_mean = np.mean(x0)
    x1_mean = np.mean(x1)

    # The larger mean is always in the numerator of the ratio
    if x1_mean >= x0_mean:
        return x1_mean / x0_mean - 1
    return 1 - x0_mean / x1_mean


def nmaef( x0, x1 ):
    '''Compute Normalized Mean Absolute Error Factor (NMAEF)

    Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125

    NMAEF = mean(abs(x1 - x0)) / mean(x0)   if mean(x1) >= mean(x0)
    NMAEF = mean(abs(x1 - x0)) / mean(x1)   otherwise

    Parameters
    ----------
    x0 : array_like
        reference values
    x1 : array_like
        experiment values

    Returns
    -------
    float
        normalized mean absolute error factor
    '''

    # Ensure that arguments have the same length
    assert (len(x0) == len(x1)), \
        "Parameters x0 and x1 must have the same length"

    # Mean values
    x0_mean = np.mean(x0)
    x1_mean = np.mean(x1)

    # Mean absolute difference; np.subtract also accepts plain lists
    abs_diff = np.mean( np.abs( np.subtract(x1, x0) ) )

    # Normalize by the smaller of the two means
    if x1_mean >= x0_mean:
        return abs_diff / x0_mean
    return abs_diff / x1_mean


def _texify_name(name):
    '''Return a LaTex formatted string for some variables

    Parameter
    ---------
    name : str
        variable name

    Returns
    -------
    pretty_name : str
        LaTeX string for recognized names, otherwise `name` unchanged
    '''
    # R2 and r2 are case sensitive; the fit-line labels are not
    if name == 'R2':
        return '$R^2$'
    if name == 'r2':
        return '$r^2$'
    fit_labels = {
        'y_ols': r'$y_{\rm OLS}$',
        'y_sma': r'$y_{\rm SMA}$',
        'y_sen': r'$y_{\rm Sen}$',
    }
    return fit_labels.get(name.lower(), name)


class BivariateStatistics:
    '''A suite of common statistics to quantify bivariate relationships

    Class method 'summary' provides a formatted summary of these statistics

    Attributes
    ----------
    xmean, ymean : float
        mean of x and y variables
    xmedian, ymedian : float
        median of x and y variables
    xstd, ystd : float
        standard deviation of x and y variables
    mean_difference, md : float
        ymean - xmean
    mean_absolute_difference, mad : float
        mean( |y-x| )
    relative_mean_difference, rmd : float
        md / xmean
    relative_mean_absolute_difference, rmad : float
        mad / xmean
    standardized_mean_difference, smd : float
        md / xstd
    standardized_mean_absolute_difference, smad : float
        mad / xstd
    mean_relative_difference, mrd : float
        mean(y/x) - 1
    median_difference, medd : float
        median(y-x)
    median_absolute_difference, medad : float
        median(|y-x|)
    relative_median_difference, rmedd : float
        median(y-x) / xmedian
    relative_median_absolute_difference, rmedad : float
        median(|y-x|) / xmedian
    median_relative_difference, medianrd, medrd : float
        median(y/x)-1
    normalized_mean_bias_factor, nmbf : float
        see `nmbf`
    normalized_mean_absolute_error_factor, nmaef : float
        see `nmaef`
    root_mean_square_difference, rmsd : float
        $\\sqrt{ \\langle (y - x)^2 \\rangle }$
    covariance : float
        cov(x,y)
    correlation_pearson, correlation, pearsonr, R, r : float
        Pearson linear correlation coefficient
    correlation_spearman, spearmanr : float
        Spearman, non-parametric rank correlation coefficient
    R2, r2 : float
        Linear coefficient of determination, $R^2$
    '''

    def __init__(self,x,y,w=None,dropna=False,data=None):
        '''Compute suite of bivariate statistics during initialization

        Statistic values are saved in attributes.
        CAUTION: Weights w are ignored except in SMA fit

        Parameters
        ----------
        x : ndarray or str
            independent variable values
        y : ndarray or str
            dependent variable values, same size as x
        w : ndarray or str, optional
            weights for points (x,y), same size as x and y
        dropna : bool, optional (default=False)
            drops NaN values from x, y, and w
        data : dict-like, optional
            if x, y, or w are str, then they should be keys in data

        Raises
        ------
        ValueError
            if a string argument is given without `data`, or if
            x, y, and w do not all have the same length
        '''

        # Get values from data if needed
        if data is None and any(isinstance(arg, str) for arg in (x, y, w)):
            raise ValueError( 'Data argument must be used if x, y, or w is a string')
        if isinstance(x,str):
            x = data[x]
        if isinstance(y,str):
            y = data[y]
        if isinstance(w,str):
            w = data[w]

        # Ensure that x, y, and w have same length
        if len(x) != len(y):
            raise ValueError( 'Arguments x and y must have the same length' )
        if w is None:
            w = np.ones_like(x)
        if len(w) != len(x):
            raise ValueError( 'Argument w (if present) must have the same length as x' )

        # Drop NaN values; the product is NaN wherever any of x, y, w is NaN
        if dropna:
            isna = np.isnan(x*y*w)
            x = x[~isna]
            y = y[~isna]
            w = w[~isna]

        diff = y - x
        absdiff = np.abs( diff )
        # Ignore divide by zero and 0/0 while dividing.
        # The context manager restores the error state even if division raises.
        with np.errstate(divide='ignore',invalid='ignore'):
            ratio = y/x

        # Means, medians, and standard deviations
        self.xmean = np.mean(x)
        self.ymean = np.mean(y)
        self.xmedian = np.median(x)
        self.ymedian = np.median(y)
        self.xstd = np.std(x)
        self.ystd = np.std(y)

        # Save values for use later (line fitting)
        self._x = x
        self._y = y
        self._w = w

        # Mean and mean absolute differences
        self.mean_difference = self.md = self.ymean - self.xmean
        self.mean_absolute_difference = self.mad = np.mean( absdiff )

        # Relative and standardized differences
        self.relative_mean_difference = self.rmd = \
            self.mean_difference / self.xmean
        self.relative_mean_absolute_difference = self.rmad = \
            self.mean_absolute_difference / self.xmean
        self.standardized_mean_difference = self.smd = \
            self.mean_difference / self.xstd
        self.standardized_mean_absolute_difference = self.smad = \
            self.mean_absolute_difference / self.xstd

        # Mean and median relative differences
        self.mean_relative_difference = self.mrd = np.mean( ratio - 1 )
        self.median_relative_difference = self.medianrd = self.medrd = \
            np.median( ratio - 1 )

        # Median and median absolute differences
        self.median_difference = self.medd = np.median( diff )
        self.median_absolute_difference = self.medad = np.median( absdiff )

        # Relative median differences
        self.relative_median_difference = self.rmedd = \
            self.median_difference / self.xmedian
        self.relative_median_absolute_difference = self.rmedad = \
            self.median_absolute_difference / self.xmedian

        self.normalized_mean_bias_factor = self.nmbf = nmbf(x,y)
        self.normalized_mean_absolute_error_factor = self.nmaef = nmaef(x,y)

        # RMS difference
        self.root_mean_square_difference = self.rmsd = \
            np.sqrt( np.mean( np.power( diff, 2) ) )

        # Covariance, correlation
        self.covariance = np.cov(x,y)[0][1]
        self.correlation = self.correlation_pearson = self.R = self.r = self.pearsonr = \
            np.corrcoef(x,y)[0][1]
        self.correlation_spearman = self.spearmanr = stats.spearmanr(x,y).statistic
        self.R2 = self.r2 = self.R**2

    def __getitem__(self,key):
        '''Accesses attribute values via object['key']'''
        return getattr(self,key)

    def fitline(self,method='sma',intercept=True,**kwargs):
        '''Compute bivariate line fit

        Parameters
        ----------
        method : str
            line fitting method: sma (default), ols, wls, York, sen, siegel
            (wls and York are not implemented yet)
        intercept : bool
            defines whether non-zero intercept should be fitted
            (ignored by sen and siegel, which always fit an intercept)
        **kwargs
            passed to `acgc.stats.sma` (e.g. robust=True) or `acgc.stats.sen`

        Returns
        -------
        result : dict
            dictionary with keys:
            - slope (float)
                slope of fitted line
            - intercept (float)
                intercept of fitted line
            - fittedvalues (array (N,))
                values on fit line
            - residuals (array (N,))
                residual from fit line
            - method (str)
                fitting method, as given
            - fitintercept (bool)
                whether an intercept was fitted

        Raises
        ------
        NotImplementedError
            for methods wls and York
        ValueError
            for unrecognized methods
        '''

        fitintercept = intercept
        method_lc = method.lower()

        if method_lc == 'sma':
            fit = sma( self._x,
                       self._y,
                       self._w,
                       intercept=fitintercept,
                       **kwargs)
            slope = fit['slope']
            intercept = fit['intercept']

        elif method_lc == 'ols':
            if fitintercept:
                design = np.vstack([self._x, np.ones(len(self._x))]).T
            else:
                design = np.vstack([self._x]).T
            coef = np.linalg.lstsq( design, self._y, rcond=None )[0]
            slope = coef[0]
            # Without an intercept column there is only one coefficient,
            # so the line is forced through the origin
            intercept = coef[1] if fitintercept else 0.0

        elif method_lc in ['theil','sen','theilsen']:
            fitintercept = True
            fit = sen( self._x,
                       self._y,
                       **kwargs)
            slope = fit.slope
            intercept = fit.intercept

        elif method_lc == 'siegel':
            fitintercept = True
            # scipy expects the dependent variable first: siegelslopes(y, x)
            siegel = stats.siegelslopes( self._y,
                                         self._x )
            slope = siegel.slope
            intercept = siegel.intercept

        elif method_lc == 'wls':
            raise NotImplementedError('WLS regression not implemented yet')

        elif method_lc == 'york':
            raise NotImplementedError('York regression not implemented yet')

        else:
            raise ValueError('Undefined method '+method)

        fitted = slope * self._x + intercept
        line = dict( slope = slope,
                     intercept = intercept,
                     fittedvalues = fitted,
                     residuals = self._y - fitted,
                     method = method,
                     fitintercept = fitintercept )

        return line

    def slope(self,method='sma',intercept=True,**kwargs):
        '''Compute slope of bivariate line fit

        Parameters
        ----------
        method : str
            line fitting method: sma (default), ols, sen, siegel
        intercept : bool
            defines whether non-zero intercept should be fitted
        **kwargs
            passed to `fitline`

        Returns
        -------
        slope : float
            slope of the fitted line
        '''
        return self.fitline(method,intercept,**kwargs)['slope']

    def intercept(self,method='sma',intercept=True,**kwargs):
        '''Compute intercept of bivariate line fit

        Parameters
        ----------
        method : str
            line fitting method: sma (default), ols, sen, siegel
        intercept : bool
            defines whether non-zero intercept should be fitted
        **kwargs
            passed to `fitline`

        Returns
        -------
        intercept : float
            value of y intercept
        '''
        return self.fitline(method,intercept,**kwargs)['intercept']

    def _expand_variables(self,variables):
        '''Expand special strings into a list of variables

        Parameter
        ---------
        variables : list or str, default='common'
            Special strings ("all","common") will be expanded to a list of variables
            list arguments will not be modified

        Returns
        -------
        list
            variable names

        Raises
        ------
        ValueError
            if variables is not a list, None, "all", or "common"
        '''
        if variables is None:
            variables='common'
        if variables=='all':
            variables=['MD','MAD','RMD','RMAD','MRD','SMD','SMAD',
                       'MedD','MedAD','RMedD','RMedAD','MedRD',
                       'NMBF','NMAEF','RMSD',
                       'R','R2','spearmanr','slope','intercept',
                       'fitline']
        elif variables=='common':
            variables=['MD','MAD','RMD','RMAD','MRD','R2','slope']
        if not isinstance(variables,list):
            raise ValueError(
                'variables must be a list, None, or one of these strings: "all","common"')

        return variables

    def summary_dict(self, variables=None,
                     fitline_kw=None,
                     floatformat_fiteqn='{:.3f}' ):
        '''Summarize bivariate statistics into a dict

        Parameters
        ----------
        variables : list or str, default='common'
            names of attribute variables to include in summary
            names are case insensitive
            The following strings are also accepted in place of a list
                "all" (displays all variables)
                "common" (displays all measures of mean difference)
        fitline_kw : dict, default=None
            keywords passed to self.fitline()
            default (None) uses method='sma' with a fitted intercept
        floatformat_fiteqn : str, default='{:.3f}'
            format specifier for slope and intercept (a,b) in y = a x + b

        Returns
        -------
        summary : dict
            names and values of variables
        '''

        # List of variables
        variables = self._expand_variables(variables)

        if fitline_kw is None:
            fitline_kw = {'method':'sma',
                          'intercept':True}

        # Construct the dict
        summary = {}
        for v in variables:
            # Names are case insensitive, including the fit-line names
            key = v.lower()
            if key in ('slope','intercept'):
                # These variables are object methods
                value = getattr(self,key)(**fitline_kw)
            elif key == 'fitline':
                # The fit equation supplies its own label (left-hand side)
                line = self.fitline(**fitline_kw)
                v,value = bivariate_line_equation(line,floatformat_fiteqn,ystring='separate')
            else:
                # Retrieve values
                value = getattr(self,key)

            summary[v] = value

        return summary

    def summary(self, variables=None, fitline_kw=None,
                floatformat='{:.4f}', floatformat_fiteqn=None,
                stringlength=None ):
        '''Summarize bivariate statistics

        Parameters
        ----------
        variables : list or str, default='common'
            names of attribute variables to include in summary
            names are case insensitive
            The following strings are also accepted in place of a list
                "all" (displays all variables)
                "common" (displays all measures of mean difference)
        fitline_kw : dict, default=None
            keywords passed to `fitline`
        floatformat : str, default='{:.4f}'
            format specifier for floating point values
        floatformat_fiteqn : str, default=floatformat
            format specifier for slope and intercept (a,b) in y = a x + b
        stringlength : int, default=None
            length of the variables on output
            default (None) is to use the length of the longest variable name

        Returns
        -------
        summary : str
            names and values of variables, one "name = value" per line
        '''
        # List of variables
        variables = self._expand_variables(variables)

        if floatformat_fiteqn is None:
            floatformat_fiteqn = floatformat
        if stringlength is None:
            # default=0 keeps an empty variable list from raising
            stringlength = max( (len(v) for v in variables), default=0 )
        stringformat = '{:'+str(stringlength)+'s}'

        # Get a dict containing the needed variables
        summarydict = self.summary_dict( variables, fitline_kw, floatformat_fiteqn )

        # String values (fit equations) are already formatted
        rows = []
        for k,v in summarydict.items():
            if isinstance(v,str):
                rows.append( (stringformat+' = {:s}\n').format(k,v) )
            else:
                rows.append( (stringformat+' = '+floatformat+'\n').format(k,v) )

        return ''.join(rows)

    def summary_fig_inset(self, ax, variables=None, fitline_kw=None,
                          floatformat='{:.3f}', floatformat_fiteqn=None,
                          loc=None, loc_units='axes',
                          **kwargs):
        '''Display bivariate statistics as a table inset on a plot axis

        Parameters
        ----------
        ax : matplotlib axes
            axis where the table will be displayed
        variables : list or str, default='common'
            names of attribute variables to include in summary
            names are case insensitive
            The following strings are also accepted in place of a list
                "all" (displays all variables)
                "common" (displays all measures of mean difference)
        fitline_kw : dict, default=None
            keywords passed to `fitline`
        floatformat : str, default='{:.3f}'
            format specifier for floating point values
        floatformat_fiteqn : str, default=floatformat
            format specifier for slope and intercept (a,b) in y = a x + b
        loc : tuple (x0,y0), default=(0.8, 0.05)
            location on the axis where the table will be drawn
            can be in data units or axes units [0-1]
        loc_units : {'axes' (default), 'data'}
            specifies whether loc has 'data' units or 'axes' units [0-1]
        **kwargs
            passed to `ax.text` for both text columns

        Returns
        -------
        text1, text2 : matplotlib text object
            Artist for the two text boxes

        Raises
        ------
        ValueError
            if loc_units is not 'data', 'axes', or 'axis'
        '''
        # List of variables
        variables = self._expand_variables(variables)

        if floatformat_fiteqn is None:
            floatformat_fiteqn = floatformat

        # Default location in lower right corner
        if loc is None:
            loc = (0.8,0.05)

        # Coordinates for loc
        units = loc_units.lower()
        if units == 'data':
            coord = ax.transData
        elif units in ['axes','axis']:
            coord = ax.transAxes
        else:
            raise ValueError('Display units should be "Data" or "Axes"')

        # Get a dict containing the needed variables
        summarydict = self.summary_dict( variables, fitline_kw, floatformat_fiteqn )

        # Column of label text
        label_text = '\n'.join([_texify_name(key) for key in summarydict])
        # Column of value text
        value_text = '\n'.join([value if isinstance(value,str) else floatformat.format(value)
                                for value in summarydict.values()])

        # Check if horizontal alignment keyword is used;
        # the long keyword takes precedence when both are given
        ha = kwargs.get('horizontalalignment', kwargs.get('ha', ''))

        # For right alignment, align on values first
        # Otherwise, align on labels
        if ha == 'right':
            first_text = value_text
            second_text = label_text
            sign = -1
        else:
            first_text = label_text
            second_text = value_text
            sign = +1

        # Add first column of text
        t1 = ax.text(loc[0],loc[1],
                     first_text,
                     transform=coord,
                     **kwargs
                     )

        # Get width of first text column, in the same units as loc
        bbox = t1.get_window_extent().transformed(coord.inverted())
        width = bbox.x1-bbox.x0

        # Add second column of text, offset by the width of the first
        t2 = ax.text(loc[0]+width*sign,loc[1],
                     second_text,
                     transform=coord,
                     **kwargs
                     )

        return [t1,t2]
class BivariateStatistics:
    '''A suite of common statistics to quantify bivariate relationships

    Class method 'summary' provides a formatted summary of these statistics

    Attributes
    ----------
    xmean, ymean : float
        mean of x and y variables
    xmedian, ymedian : float
        median of x and y variables
    xstd, ystd : float
        standard deviation of x and y variables
    mean_difference, md : float
        ymean - xmean
    mean_absolute_difference, mad : float
        mean( |y-x| )
    relative_mean_difference, rmd : float
        md / xmean
    relative_mean_absolute_difference, rmad : float
        mad / xmean
    standardized_mean_difference, smd : float
        md / xstd
    standardized_mean_absolute_difference, smad : float
        mad / xstd
    mean_relative_difference, mrd : float
        mean(y/x) - 1
    median_difference, medd : float
        median(y-x)
    median_absolute_difference, medad : float
        median(|y-x|)
    relative_median_difference, rmedd : float
        median(y-x) / xmedian
    relative_median_absolute_difference, rmedad : float
        median(|y-x|) / xmedian
    median_relative_difference, medianrd, medrd : float
        median(y/x)-1
    normalized_mean_bias_factor, nmbf : float
        see `nmbf`
    normalized_mean_absolute_error_factor, nmaef : float
        see `nmaef`
    root_mean_square_difference, rmsd : float
        $\\sqrt{ \\langle (y - x)^2 \\rangle }$
    covariance : float
        cov(x,y)
    correlation_pearson, correlation, pearsonr, R, r : float
        Pearson linear correlation coefficient
    correlation_spearman, spearmanr : float
        Spearman, non-parametric rank correlation coefficient
    R2, r2 : float
        Linear coefficient of determination, $R^2$
    '''

    def __init__(self,x,y,w=None,dropna=False,data=None):
        '''Compute suite of bivariate statistics during initialization

        Statistic values are saved in attributes.
        CAUTION: Weights w are ignored except in SMA fit

        Parameters
        ----------
        x : ndarray or str
            independent variable values
        y : ndarray or str
            dependent variable values, same size as x
        w : ndarray or str, optional
            weights for points (x,y), same size as x and y
        dropna : bool, optional (default=False)
            drops NaN values from x, y, and w
        data : dict-like, optional
            if x, y, or w are str, then they should be keys in data

        Raises
        ------
        ValueError
            if a string argument is given without `data`, or if
            x, y, and w do not all have the same length
        '''

        # Get values from data if needed
        if data is None and any(isinstance(arg, str) for arg in (x, y, w)):
            raise ValueError( 'Data argument must be used if x, y, or w is a string')
        if isinstance(x,str):
            x = data[x]
        if isinstance(y,str):
            y = data[y]
        if isinstance(w,str):
            w = data[w]

        # Ensure that x, y, and w have same length
        if len(x) != len(y):
            raise ValueError( 'Arguments x and y must have the same length' )
        if w is None:
            w = np.ones_like(x)
        if len(w) != len(x):
            raise ValueError( 'Argument w (if present) must have the same length as x' )

        # Drop NaN values; the product is NaN wherever any of x, y, w is NaN
        if dropna:
            isna = np.isnan(x*y*w)
            x = x[~isna]
            y = y[~isna]
            w = w[~isna]

        diff = y - x
        absdiff = np.abs( diff )
        # Ignore divide by zero and 0/0 while dividing.
        # The context manager restores the error state even if division raises.
        with np.errstate(divide='ignore',invalid='ignore'):
            ratio = y/x

        # Means, medians, and standard deviations
        self.xmean = np.mean(x)
        self.ymean = np.mean(y)
        self.xmedian = np.median(x)
        self.ymedian = np.median(y)
        self.xstd = np.std(x)
        self.ystd = np.std(y)

        # Save values for use later (line fitting)
        self._x = x
        self._y = y
        self._w = w

        # Mean and mean absolute differences
        self.mean_difference = self.md = self.ymean - self.xmean
        self.mean_absolute_difference = self.mad = np.mean( absdiff )

        # Relative and standardized differences
        self.relative_mean_difference = self.rmd = \
            self.mean_difference / self.xmean
        self.relative_mean_absolute_difference = self.rmad = \
            self.mean_absolute_difference / self.xmean
        self.standardized_mean_difference = self.smd = \
            self.mean_difference / self.xstd
        self.standardized_mean_absolute_difference = self.smad = \
            self.mean_absolute_difference / self.xstd

        # Mean and median relative differences
        self.mean_relative_difference = self.mrd = np.mean( ratio - 1 )
        self.median_relative_difference = self.medianrd = self.medrd = \
            np.median( ratio - 1 )

        # Median and median absolute differences
        self.median_difference = self.medd = np.median( diff )
        self.median_absolute_difference = self.medad = np.median( absdiff )

        # Relative median differences
        self.relative_median_difference = self.rmedd = \
            self.median_difference / self.xmedian
        self.relative_median_absolute_difference = self.rmedad = \
            self.median_absolute_difference / self.xmedian

        self.normalized_mean_bias_factor = self.nmbf = nmbf(x,y)
        self.normalized_mean_absolute_error_factor = self.nmaef = nmaef(x,y)

        # RMS difference
        self.root_mean_square_difference = self.rmsd = \
            np.sqrt( np.mean( np.power( diff, 2) ) )

        # Covariance, correlation
        self.covariance = np.cov(x,y)[0][1]
        self.correlation = self.correlation_pearson = self.R = self.r = self.pearsonr = \
            np.corrcoef(x,y)[0][1]
        self.correlation_spearman = self.spearmanr = stats.spearmanr(x,y).statistic
        self.R2 = self.r2 = self.R**2

    def __getitem__(self,key):
        '''Accesses attribute values via object['key']'''
        return getattr(self,key)

    def fitline(self,method='sma',intercept=True,**kwargs):
        '''Compute bivariate line fit

        Parameters
        ----------
        method : str
            line fitting method: sma (default), ols, wls, York, sen, siegel
            (wls and York are not implemented yet)
        intercept : bool
            defines whether non-zero intercept should be fitted
            (ignored by sen and siegel, which always fit an intercept)
        **kwargs
            passed to `acgc.stats.sma` (e.g. robust=True) or `acgc.stats.sen`

        Returns
        -------
        result : dict
            dictionary with keys:
            - slope (float)
                slope of fitted line
            - intercept (float)
                intercept of fitted line
            - fittedvalues (array (N,))
                values on fit line
            - residuals (array (N,))
                residual from fit line
            - method (str)
                fitting method, as given
            - fitintercept (bool)
                whether an intercept was fitted

        Raises
        ------
        NotImplementedError
            for methods wls and York
        ValueError
            for unrecognized methods
        '''

        fitintercept = intercept
        method_lc = method.lower()

        if method_lc == 'sma':
            fit = sma( self._x,
                       self._y,
                       self._w,
                       intercept=fitintercept,
                       **kwargs)
            slope = fit['slope']
            intercept = fit['intercept']

        elif method_lc == 'ols':
            if fitintercept:
                design = np.vstack([self._x, np.ones(len(self._x))]).T
            else:
                design = np.vstack([self._x]).T
            coef = np.linalg.lstsq( design, self._y, rcond=None )[0]
            slope = coef[0]
            # Without an intercept column there is only one coefficient,
            # so the line is forced through the origin
            intercept = coef[1] if fitintercept else 0.0

        elif method_lc in ['theil','sen','theilsen']:
            fitintercept = True
            fit = sen( self._x,
                       self._y,
                       **kwargs)
            slope = fit.slope
            intercept = fit.intercept

        elif method_lc == 'siegel':
            fitintercept = True
            # scipy expects the dependent variable first: siegelslopes(y, x)
            siegel = stats.siegelslopes( self._y,
                                         self._x )
            slope = siegel.slope
            intercept = siegel.intercept

        elif method_lc == 'wls':
            raise NotImplementedError('WLS regression not implemented yet')

        elif method_lc == 'york':
            raise NotImplementedError('York regression not implemented yet')

        else:
            raise ValueError('Undefined method '+method)

        fitted = slope * self._x + intercept
        line = dict( slope = slope,
                     intercept = intercept,
                     fittedvalues = fitted,
                     residuals = self._y - fitted,
                     method = method,
                     fitintercept = fitintercept )

        return line

    def slope(self,method='sma',intercept=True,**kwargs):
        '''Compute slope of bivariate line fit

        Parameters
        ----------
        method : str
            line fitting method: sma (default), ols, sen, siegel
        intercept : bool
            defines whether non-zero intercept should be fitted
        **kwargs
            passed to `fitline`

        Returns
        -------
        slope : float
            slope of the fitted line
        '''
        return self.fitline(method,intercept,**kwargs)['slope']

    def intercept(self,method='sma',intercept=True,**kwargs):
        '''Compute intercept of bivariate line fit

        Parameters
        ----------
        method : str
            line fitting method: sma (default), ols, sen, siegel
        intercept : bool
            defines whether non-zero intercept should be fitted
        **kwargs
            passed to `fitline`

        Returns
        -------
        intercept : float
            value of y intercept
        '''
        return self.fitline(method,intercept,**kwargs)['intercept']

    def _expand_variables(self,variables):
        '''Expand special strings into a list of variables

        Parameter
        ---------
        variables : list or str, default='common'
            Special strings ("all","common") will be expanded to a list of variables
            list arguments will not be modified

        Returns
        -------
        list
            variable names

        Raises
        ------
        ValueError
            if variables is not a list, None, "all", or "common"
        '''
        if variables is None:
            variables='common'
        if variables=='all':
            variables=['MD','MAD','RMD','RMAD','MRD','SMD','SMAD',
                       'MedD','MedAD','RMedD','RMedAD','MedRD',
                       'NMBF','NMAEF','RMSD',
                       'R','R2','spearmanr','slope','intercept',
                       'fitline']
        elif variables=='common':
            variables=['MD','MAD','RMD','RMAD','MRD','R2','slope']
        if not isinstance(variables,list):
            raise ValueError(
                'variables must be a list, None, or one of these strings: "all","common"')

        return variables

    def summary_dict(self, variables=None,
                     fitline_kw=None,
                     floatformat_fiteqn='{:.3f}' ):
        '''Summarize bivariate statistics into a dict

        Parameters
        ----------
        variables : list or str, default='common'
            names of attribute variables to include in summary
            names are case insensitive
            The following strings are also accepted in place of a list
                "all" (displays all variables)
                "common" (displays all measures of mean difference)
        fitline_kw : dict, default=None
            keywords passed to self.fitline()
            default (None) uses method='sma' with a fitted intercept
        floatformat_fiteqn : str, default='{:.3f}'
            format specifier for slope and intercept (a,b) in y = a x + b

        Returns
        -------
        summary : dict
            names and values of variables
        '''

        # List of variables
        variables = self._expand_variables(variables)

        if fitline_kw is None:
            fitline_kw = {'method':'sma',
                          'intercept':True}

        # Construct the dict
        summary = {}
        for v in variables:
            # Names are case insensitive, including the fit-line names
            key = v.lower()
            if key in ('slope','intercept'):
                # These variables are object methods
                value = getattr(self,key)(**fitline_kw)
            elif key == 'fitline':
                # The fit equation supplies its own label (left-hand side)
                line = self.fitline(**fitline_kw)
                v,value = bivariate_line_equation(line,floatformat_fiteqn,ystring='separate')
            else:
                # Retrieve values
                value = getattr(self,key)

            summary[v] = value

        return summary

    def summary(self, variables=None, fitline_kw=None,
                floatformat='{:.4f}', floatformat_fiteqn=None,
                stringlength=None ):
        '''Summarize bivariate statistics

        Parameters
        ----------
        variables : list or str, default='common'
            names of attribute variables to include in summary
            names are case insensitive
            The following strings are also accepted in place of a list
                "all" (displays all variables)
                "common" (displays all measures of mean difference)
        fitline_kw : dict, default=None
            keywords passed to `fitline`
        floatformat : str, default='{:.4f}'
            format specifier for floating point values
        floatformat_fiteqn : str, default=floatformat
            format specifier for slope and intercept (a,b) in y = a x + b
        stringlength : int, default=None
            length of the variables on output
            default (None) is to use the length of the longest variable name

        Returns
        -------
        summary : str
            names and values of variables, one "name = value" per line
        '''
        # List of variables
        variables = self._expand_variables(variables)

        if floatformat_fiteqn is None:
            floatformat_fiteqn = floatformat
        if stringlength is None:
            # default=0 keeps an empty variable list from raising
            stringlength = max( (len(v) for v in variables), default=0 )
        stringformat = '{:'+str(stringlength)+'s}'

        # Get a dict containing the needed variables
        summarydict = self.summary_dict( variables, fitline_kw, floatformat_fiteqn )

        # String values (fit equations) are already formatted
        rows = []
        for k,v in summarydict.items():
            if isinstance(v,str):
                rows.append( (stringformat+' = {:s}\n').format(k,v) )
            else:
                rows.append( (stringformat+' = '+floatformat+'\n').format(k,v) )

        return ''.join(rows)

    def summary_fig_inset(self, ax, variables=None, fitline_kw=None,
                          floatformat='{:.3f}', floatformat_fiteqn=None,
                          loc=None, loc_units='axes',
                          **kwargs):
        '''Display bivariate statistics as a table inset on a plot axis

        Parameters
        ----------
        ax : matplotlib axes
            axis where the table will be displayed
        variables : list or str, default='common'
            names of attribute variables to include in summary
            names are case insensitive
            The following strings are also accepted in place of a list
                "all" (displays all variables)
                "common" (displays all measures of mean difference)
        fitline_kw : dict, default=None
            keywords passed to `fitline`
        floatformat : str, default='{:.3f}'
            format specifier for floating point values
        floatformat_fiteqn : str, default=floatformat
            format specifier for slope and intercept (a,b) in y = a x + b
        loc : tuple (x0,y0), default=(0.8, 0.05)
            location on the axis where the table will be drawn
            can be in data units or axes units [0-1]
        loc_units : {'axes' (default), 'data'}
            specifies whether loc has 'data' units or 'axes' units [0-1]
        **kwargs
            passed to `ax.text` for both text columns

        Returns
        -------
        text1, text2 : matplotlib text object
            Artist for the two text boxes

        Raises
        ------
        ValueError
            if loc_units is not 'data', 'axes', or 'axis'
        '''
        # List of variables
        variables = self._expand_variables(variables)

        if floatformat_fiteqn is None:
            floatformat_fiteqn = floatformat

        # Default location in lower right corner
        if loc is None:
            loc = (0.8,0.05)

        # Coordinates for loc
        units = loc_units.lower()
        if units == 'data':
            coord = ax.transData
        elif units in ['axes','axis']:
            coord = ax.transAxes
        else:
            raise ValueError('Display units should be "Data" or "Axes"')

        # Get a dict containing the needed variables
        summarydict = self.summary_dict( variables, fitline_kw, floatformat_fiteqn )

        # Column of label text
        label_text = '\n'.join([_texify_name(key) for key in summarydict])
        # Column of value text
        value_text = '\n'.join([value if isinstance(value,str) else floatformat.format(value)
                                for value in summarydict.values()])

        # Check if horizontal alignment keyword is used;
        # the long keyword takes precedence when both are given
        ha = kwargs.get('horizontalalignment', kwargs.get('ha', ''))

        # For right alignment, align on values first
        # Otherwise, align on labels
        if ha == 'right':
            first_text = value_text
            second_text = label_text
            sign = -1
        else:
            first_text = label_text
            second_text = value_text
            sign = +1

        # Add first column of text
        t1 = ax.text(loc[0],loc[1],
                     first_text,
                     transform=coord,
                     **kwargs
                     )

        # Get width of first text column, in the same units as loc
        bbox = t1.get_window_extent().transformed(coord.inverted())
        width = bbox.x1-bbox.x0

        # Add second column of text, offset by the width of the first
        t2 = ax.text(loc[0]+width*sign,loc[1],
                     second_text,
                     transform=coord,
                     **kwargs
                     )

        return [t1,t2]
A suite of common statistics to quantify bivariate relationships
The `summary` method provides a formatted summary of these statistics
Attributes
- xmean, ymean (float): mean of x and y variables
- xmedian, ymedian (float): median of x and y variables
- xstd, ystd (float): standard deviation of x and y variables
- mean_difference, md (float): ymean - xmean
- mean_absolute_difference, mad (float): mean( |y-x| )
- relative_mean_difference, rmd (float): md / xmean
- relative_mean_absolute_difference, rmad (float): mad / xmean
- standardized_mean_difference, smd (float): md / xstd
- standardized_mean_absolute_difference, smad (float): mad / xstd
- mean_relative_difference, mrd (float): mean(y/x) - 1
- median_difference, medd (float): median(y-x)
- median_absolute_difference, medad (float): median(|y-x|)
- relative_median_difference, rmedd (float): median(y-x) / xmedian
- relative_median_absolute_difference, rmedad (float): median(|y-x|) / xmedian
- median_relative_difference, medianrd, medrd (float): median(y/x)-1
- normalized_mean_bias_factor, nmbf (float):
see
nmbf
- normalized_mean_absolute_error_factor, nmaef (float):
see
nmaef
- root_mean_square_difference, rmsd (float): $\sqrt{ \langle (y - x)^2 \rangle }$
- covariance (float): cov(x,y)
- correlation_pearson, correlation, pearsonr, R, r (float): Pearson linear correlation coefficient
- correlation_spearman, spearmanr (float): Spearman, non-parametric rank correlation coefficient
- R2, r2 (float): Linear coefficient of determination, $R^2$
def __init__(self,x,y,w=None,dropna=False,data=None):
    '''Compute suite of bivariate statistics during initialization

    Statistic values are saved in attributes.
    CAUTION: Weights w are ignored except in SMA fit

    Parameters
    ----------
    x : ndarray or str
        independent variable values
    y : ndarray or str
        dependent variable values, same size as x
    w : ndarray or str, optional
        weights for points (x,y), same size as x and y
        default (None) gives every point a weight of one
    dropna : bool, optional (default=False)
        drops points where x, y, or w is NaN
    data : dict-like, optional
        if x, y, or w are str, then they should be keys in data

    Raises
    ------
    ValueError
        if x, y, or w is a str and data is None, or
        if x, y, and w do not all have the same length
    '''

    # Get values from data if needed
    if data is None and any(isinstance(arg, str) for arg in (x, y, w)):
        raise ValueError( 'Data argument must be used if x, y, or w is a string')
    if isinstance(x,str):
        x = data[x]
    if isinstance(y,str):
        y = data[y]
    if isinstance(w,str):
        w = data[w]

    # Ensure that x, y, and w have same length
    if len(x) != len(y):
        raise ValueError( 'Arguments x and y must have the same length' )
    if w is None:
        w = np.ones_like(x)
    if len(w) != len(x):
        raise ValueError( 'Argument w (if present) must have the same length as x' )

    # Drop NaN values
    if dropna:
        # Test each array separately; a product such as inf*0 is NaN
        # even though neither factor is NaN
        isna = np.isnan(x) | np.isnan(y) | np.isnan(w)
        x = x[~isna]
        y = y[~isna]
        w = w[~isna]

    diff = y - x
    absdiff = np.abs( diff )
    # Ignore divide by zero and 0/0 while dividing;
    # the context manager restores the error settings even if division raises
    with np.errstate(divide='ignore',invalid='ignore'):
        ratio = y/x

    # Means, medians, and standard deviations
    self.xmean = np.mean(x)
    self.ymean = np.mean(y)
    self.xmedian = np.median(x)
    self.ymedian = np.median(y)
    self.xstd = np.std(x)
    self.ystd = np.std(y)

    # Save values for use later (line fitting)
    self._x = x
    self._y = y
    self._w = w

    # Mean and mean absolute differences
    self.mean_difference = self.md = self.ymean - self.xmean
    self.mean_absolute_difference = self.mad = np.mean( absdiff )

    # Relative and standardized differences
    self.relative_mean_difference = self.rmd = self.mean_difference / self.xmean
    self.relative_mean_absolute_difference = self.rmad = self.mean_absolute_difference / self.xmean
    self.standardized_mean_difference = self.smd = self.mean_difference / self.xstd
    self.standardized_mean_absolute_difference = self.smad = self.mean_absolute_difference / self.xstd

    # Mean and median relative differences
    self.mean_relative_difference = self.mrd = np.mean( ratio - 1 )
    self.median_relative_difference = self.medianrd = self.medrd = np.median( ratio - 1 )

    # Median and median absolute differences
    self.median_difference = self.medd = np.median( diff )
    self.median_absolute_difference = self.medad = np.median( absdiff )

    # Relative median differences
    self.relative_median_difference = self.rmedd = self.median_difference / self.xmedian
    self.relative_median_absolute_difference = self.rmedad = self.median_absolute_difference / self.xmedian

    # Normalized mean bias factor and mean absolute error factor
    self.normalized_mean_bias_factor = self.nmbf = nmbf(x,y)
    self.normalized_mean_absolute_error_factor = self.nmaef = nmaef(x,y)

    # RMS difference
    self.root_mean_square_difference = self.rmsd = np.sqrt( np.mean( np.power( diff, 2) ) )

    # Covariance, correlation
    self.covariance = np.cov(x,y)[0][1]
    self.correlation = self.correlation_pearson = self.R = self.r = self.pearsonr = \
        np.corrcoef(x,y)[0][1]
    self.correlation_spearman = self.spearmanr = stats.spearmanr(x,y).statistic
    self.R2 = self.r2 = self.R**2
Compute suite of bivariate statistics during initialization
Statistic values are saved in attributes. CAUTION: Weights w are ignored except in SMA fit
Parameters
- x (ndarray or str): independent variable values
- y (ndarray or str): dependent variable values, same size as x
- w (ndarray or str, optional): weights for points (x,y), same size as x and y
- dropna (bool, optional (default=False)): drops NaN values from x, y, and w
- data (dict-like, optional): if x, y, or w are str, then they should be keys in data
def fitline(self,method='sma',intercept=True,**kwargs):
    '''Compute bivariate line fit

    Parameters
    ----------
    method : str
        line fitting method (case insensitive):
        sma (default), ols, sen (also theil, theilsen), siegel
        wls and York are recognized but not implemented
    intercept : bool
        defines whether non-zero intercept should be fitted
        Sen and Siegel fits always include an intercept
    **kwargs
        passed to `acgc.stats.sma` (e.g. robust=True) or `sen`;
        not used by the ols and siegel methods

    Returns
    -------
    result : dict
        dictionary with keys:
        - slope (float)
            slope of fitted line
        - intercept (float)
            intercept of fitted line (0 for OLS without intercept)
        - fittedvalues (array (N,))
            values on fit line
        - residuals (array (N,))
            residual from fit line
        - method (str)
            method argument, as passed
        - fitintercept (bool)
            whether an intercept was fitted

    Raises
    ------
    NotImplementedError
        if method is wls or York
    ValueError
        if method is not recognized
    '''

    fitintercept = intercept
    method_key = method.lower()

    if method_key=='sma':
        fit = sma(  self._x,
                    self._y,
                    self._w,
                    intercept=fitintercept,
                    **kwargs)
        slope = fit['slope']
        intercept = fit['intercept']

    elif method_key=='ols':
        if fitintercept:
            design = np.vstack([self._x,np.ones(len(self._x))]).T
            coef = np.linalg.lstsq( design, self._y, rcond=None )[0]
            slope = coef[0]
            intercept = coef[1]
        else:
            # Only the slope is solved for, so the solution has a single
            # coefficient and the line passes through the origin
            design = np.vstack([self._x]).T
            coef = np.linalg.lstsq( design, self._y, rcond=None )[0]
            slope = coef[0]
            intercept = 0.0

    elif method_key in ['theil','sen','theilsen']:
        fitintercept = True
        fit = sen( self._x,
                   self._y,
                   **kwargs)
        slope = fit.slope
        intercept = fit.intercept

    elif method_key=='siegel':
        fitintercept = True
        siegel = stats.siegelslopes( self._x,
                                     self._y )
        slope = siegel.slope
        intercept = siegel.intercept

    elif method_key=='wls':
        raise NotImplementedError('WLS regression not implemented yet')

    elif method_key=='york':
        raise NotImplementedError('York regression not implemented yet')

    else:
        raise ValueError('Undefined method '+method)

    fittedvalues = slope * self._x + intercept
    line = dict( slope        = slope,
                 intercept    = intercept,
                 fittedvalues = fittedvalues,
                 residuals    = self._y - fittedvalues,
                 method       = method,
                 fitintercept = fitintercept )

    return line
Compute bivariate line fit
Parameters
- method (str): line fitting method: sma (default), ols, wls, York, sen, siegel
- intercept (bool): defines whether non-zero intercept should be fitted
- **kwargs: passed to
acgc.stats.sma
(e.g. robust=True)
Returns
- result (dict):
dictionary with keys:
- slope (float) slope of fitted line
- intercept (float) intercept of fitted line
- fittedvalues (array (N,)) values on fit line
- residuals (array (N,)) residual from fit line
def slope(self,method='sma',intercept=True,**kwargs):
    '''Compute slope of bivariate line fit

    Parameters
    ----------
    method : str
        line fitting method, passed to `fitline` (default: sma)
    intercept : bool
        defines whether non-zero intercept should be fitted
    **kwargs
        passed to `fitline`

    Returns
    -------
    slope : float
        value of the 'slope' entry returned by `fitline`
    '''
    line = self.fitline(method, intercept, **kwargs)
    return line['slope']
Compute slope of bivariate line fit
Parameters
- method (str): line fitting method, passed to fitline: sma (default), ols, wls, York, sen, siegel
- intercept (bool): defines whether non-zero intercept should be fitted
- **kwargs: passed to
fitline
Returns
- slope (float): slope of the fitted line
def intercept(self,method='sma',intercept=True,**kwargs):
    '''Compute intercept of bivariate line fit

    Parameters
    ----------
    method : str
        line fitting method, passed to `fitline` (default: sma)
    intercept : bool
        defines whether non-zero intercept should be fitted
    **kwargs
        passed to `fitline`

    Returns
    -------
    intercept : float
        value of the 'intercept' entry returned by `fitline`
    '''
    line = self.fitline(method, intercept, **kwargs)
    return line['intercept']
Compute intercept of bivariate line fit
Parameters
- method (str): line fitting method, passed to fitline: sma (default), ols, wls, York, sen, siegel
- intercept (bool): defines whether non-zero intercept should be fitted
- **kwargs: passed to
fitline
Returns
- intercept (float): value of y intercept
def summary_dict(self, variables=None,
                 fitline_kw=None,
                 floatformat_fiteqn='{:.3f}' ):
    '''Summarize bivariate statistics into a dict

    Parameters
    ----------
    variables : list or str, default='common'
        names of attribute variables to include in summary
        names are case insensitive
        The following strings are also accepted in place of a list
            "all" (displays all variables)
            "common" (displays all measures of mean difference)
    fitline_kw : dict, default=None
        keywords passed to self.fitline()
        default (None) uses method='sma' and intercept=True
    floatformat_fiteqn : str, default='{:.3f}'
        format specifier passed to `bivariate_line_equation`
        when 'fitline' is one of the variables

    Returns
    -------
    summary : dict
        names and values of variables
    '''

    # List of variables
    names = self._expand_variables(variables)

    if fitline_kw is None:
        fitline_kw = dict(method='sma', intercept=True)

    # Construct the dict, one entry per requested name
    summary = {}
    for name in names:
        if name == 'fitline':
            # The line equation supplies its own key and a formatted string
            line = self.fitline(**fitline_kw)
            key, value = bivariate_line_equation(line,floatformat_fiteqn,ystring='separate')
        elif name in ('slope','intercept'):
            # These variables are object methods
            key = name
            value = getattr(self,name)(**fitline_kw)
        else:
            # All other variables are attributes with lower case names
            key = name
            value = getattr(self,name.lower())

        summary[key] = value

    return summary
Summarize bivariate statistics into a dict
Parameters
- variables (list or str, default='common'):
names of attribute variables to include in summary
names are case insensitive
The following strings are also accepted in place of a list: "all" (displays all variables), "common" (displays all measures of mean difference)
- fitline_kw (dict, default=None): keywords passed to self.fitline()
Returns
- summary (dict): names and values of variables
def summary(self, variables=None, fitline_kw=None,
            floatformat='{:.4f}', floatformat_fiteqn=None,
            stringlength=None ):
    '''Summarize bivariate statistics

    Parameters
    ----------
    variables : list or str, default='common'
        names of attribute variables to include in summary
        names are case insensitive
        The following strings are also accepted in place of a list
            "all" (displays all variables)
            "common" (displays all measures of mean difference)
    fitline_kw : dict, default=None
        keywords passed to `fitline`
    floatformat : str, default='{:.4f}'
        format specifier for floating point values
    floatformat_fiteqn : str, default=floatformat
        format specifier for slope and intercept (a,b) in y = a x + b
    stringlength : int, default=None
        length of the variables on output
        default (None) is to use the length of the longest variable name

    Returns
    -------
    summary : str
        names and values of variables, one "name = value" line per variable
        empty string if there are no variables to report
    '''
    # List of variables
    variables = self._expand_variables(variables)

    if floatformat_fiteqn is None:
        floatformat_fiteqn = floatformat
    if stringlength is None:
        # default=0 keeps an empty variable list from raising
        stringlength = max( (len(v) for v in variables), default=0 )
    stringformat = '{:'+str(stringlength)+'s}'

    # Get a dict containing the needed variables
    summarydict = self.summary_dict( variables, fitline_kw, floatformat_fiteqn )

    # One line per variable; strings (e.g. the fit equation) are printed as is,
    # everything else uses floatformat
    lines = []
    for k,v in summarydict.items():
        valueformat = '{:s}' if isinstance(v,str) else floatformat
        lines.append( (stringformat+' = '+valueformat+'\n').format(k,v) )

    return ''.join(lines)
Summarize bivariate statistics
Parameters
- variables (list or str, default='common'):
names of attribute variables to include in summary
names are case insensitive
The following strings are also accepted in place of a list: "all" (displays all variables), "common" (displays all measures of mean difference)
- floatformat (str, default='{:.4f}'): format specifier for floating point values
- floatformat_fiteqn (str, default=floatformat): format specifier for slope and intercept (a,b) in y = a x + b
- stringlength (int, default=None): length of the variables on output default (None) is to use the length of the longest variable name
- fitline_kw (dict, default=None):
keywords passed to
fitline
Returns
- summary (str): names and values of variables
def summary_fig_inset(self, ax, variables=None, fitline_kw=None,
                      floatformat='{:.3f}', floatformat_fiteqn=None,
                      loc=None, loc_units='axes',
                      **kwargs):
    '''Display bivariate statistics as a table inset on a plot axis

    The table is drawn as two text boxes placed side by side:
    one column of variable names and one column of values.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        axis where the table will be displayed
    variables : list or str, default='common'
        names of attribute variables to include in summary
        names are case insensitive
        The following strings are also accepted in place of a list
            "all" (displays all variables)
            "common" (displays all measures of mean difference)
    fitline_kw : dict, default=None
        keywords passed to `fitline`
    floatformat : str, default='{:.3f}'
        format specifier for floating point values
    floatformat_fiteqn : str, default=floatformat
        format specifier for slope and intercept (a,b) in y = a x + b
    loc : tuple (x0,y0), default=(0.8, 0.05)
        location on the axis where the table will be drawn
        can be in data units or axes units [0-1]
    loc_units : {'axes' (default), 'data'}
        specifies whether loc has 'data' units or 'axes' units [0-1]
    **kwargs
        passed to `ax.text` for both columns
        if ha (or horizontalalignment) is 'right', the value column is
        drawn at loc and the label column is placed to its left

    Returns
    -------
    text1, text2 : matplotlib text object
        Artists for the two text boxes, returned as a list [text1, text2]

    Raises
    ------
    ValueError
        if loc_units is not 'data', 'axes', or 'axis' (case insensitive)
    '''
    # List of variables
    variables = self._expand_variables(variables)

    if floatformat_fiteqn is None:
        floatformat_fiteqn = floatformat

    # Default location in lower right corner
    if loc is None:
        loc = (0.8,0.05)
    x0, y0 = loc[0], loc[1]

    # Coordinate transform matching the units of loc
    units = loc_units.lower()
    if units=='data':
        coord = ax.transData
    elif units in ['axes','axis']:
        coord = ax.transAxes
    else:
        raise ValueError('Display units should be "Data" or "Axes"')

    # Get a dict containing the needed variables
    summarydict = self.summary_dict( variables, fitline_kw, floatformat_fiteqn )

    # Build the two columns: labels and values, one row per variable
    labels = []
    values = []
    for key, value in summarydict.items():
        labels.append( _texify_name(key) )
        values.append( value if isinstance(value,str) else floatformat.format(value) )
    label_text = '\n'.join(labels)
    value_text = '\n'.join(values)

    # Horizontal alignment keyword, if used; the long form takes precedence
    ha = kwargs.get('horizontalalignment', kwargs.get('ha', ''))

    # For right alignment, align on values first
    # Otherwise, align on labels
    if ha=='right':
        first_text, second_text, sign = value_text, label_text, -1
    else:
        first_text, second_text, sign = label_text, value_text, +1

    # Add first column of text
    t1 = ax.text( x0, y0, first_text, transform=coord, **kwargs )

    # Width of the first column, converted back into the units of loc,
    # sets the horizontal offset of the second column
    extent = t1.get_window_extent().transformed(coord.inverted())
    width = extent.x1 - extent.x0

    # Add second column of text
    t2 = ax.text( x0 + width*sign, y0, second_text, transform=coord, **kwargs )

    return [t1,t2]
Display bivariate statistics as a table inset on a plot axis
Parameters
- ax (matplotlib.Figure.Axis): axis where the table will be displayed
- variables (list or str, default='common'):
names of attribute variables to include in summary
names are case insensitive
The following strings are also accepted in place of a list: "all" (displays all variables), "common" (displays all measures of mean difference)
- fitline_kw (dict, default=None):
keywords passed to
fitline
- floatformat (str, default='{:.3f}'): format specifier for floating point values
- floatformat_fiteqn (str, default=floatformat): format specifier for slope and intercept (a,b) in y = a x + b
- loc (tuple (x0,y0), default=(0.8, 0.05)): location on the axis where the table will be drawn; can be in data units or axes units [0-1]
- loc_units ({'axes' (default), 'data'}): specifies whether loc has 'data' units or 'axes' units [0-1]
Returns
- text1, text2 (matplotlib text object): Artist for the two text boxes
def nmb( x0, x1 ):
    '''Compute Normalized Mean Bias (NMB)

    NMB = ( mean(x1) - mean(x0) ) / mean(x0)

    Parameters
    ----------
    x0 : array_like
        reference values
    x1 : array_like
        experiment values, same length as x0

    Returns
    -------
    float
        normalized mean bias, computed as mean(x1) / mean(x0) - 1
    '''

    # Both samples must have the same number of values
    n_ref, n_exp = len(x0), len(x1)
    assert n_ref == n_exp, \
        "Parameters x0 and x1 must have the same length"

    # Ratio of experiment mean to reference mean, expressed as a bias
    mean_ratio = np.mean(x1) / np.mean(x0)
    return mean_ratio - 1
Compute Normalized Mean Bias (NMB)
NMB = ( mean(x1) - mean(x0) ) / mean(x0)
Parameters
- x0 (array_like): reference values
- x1 (array_like): experiment values
def nmae( x0, x1 ):
    '''Compute Normalized Mean Absolute Error (NMAE)

    NMAE = mean(abs(x1 - x0)) / abs(mean(x0))

    Parameters
    ----------
    x0 : array_like
        reference values
    x1 : array_like
        experiment values

    Returns
    -------
    float
        normalized mean absolute error
    '''

    # Mean of the reference values
    x0_mean = np.mean(x0)

    # Mean absolute difference
    # np.subtract accepts any array_like, including lists and tuples,
    # which do not support the "-" operator
    abs_diff = np.mean( np.abs( np.subtract(x1, x0) ) )

    # Metric value
    return abs_diff / np.abs( x0_mean )
Compute Normalized Mean Absolute Error (NMAE)
NMAE = mean(abs(x1 - x0)) / abs(mean(x0))
Parameters
- x0 (array_like): reference values
- x1 (array_like): experiment values
def nmbf( x0, x1 ):
    '''Compute Normalized Mean Bias Factor (NMBF)

    Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125

    Parameters
    ----------
    x0 : array_like
        reference values
    x1 : array_like
        experiment values, same length as x0

    Returns
    -------
    float
        mean(x1) / mean(x0) - 1  if mean(x1) >= mean(x0),
        1 - mean(x0) / mean(x1)  otherwise
    '''

    # Both samples must have the same number of values
    assert len(x0) == len(x1), \
        "Parameters x0 and x1 must have the same length"

    ref_mean = np.mean(x0)
    exp_mean = np.mean(x1)

    # The larger mean goes in the numerator, so the factor is symmetric
    # for over- and underestimates; the sign gives the direction of the bias
    overestimate = exp_mean >= ref_mean
    return exp_mean / ref_mean - 1 if overestimate else 1 - ref_mean / exp_mean
Compute Normalized Mean Bias Factor (NMBF)
Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125
Parameters
- x0 (array_like): reference values
- x1 (array_like): experiment values
def nmaef( x0, x1 ):
    '''Compute Normalized Mean Absolute Error Factor (NMAEF)

    Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125

    Parameters
    ----------
    x0 : array_like
        reference values
    x1 : array_like
        experiment values, same length as x0

    Returns
    -------
    float
        mean(abs(x1 - x0)) / mean(x0)  if mean(x1) >= mean(x0),
        mean(abs(x1 - x0)) / mean(x1)  otherwise
    '''

    # Ensure that arguments have the same length
    assert (len(x0) == len(x1)), \
        "Parameters x0 and x1 must have the same length"

    # Mean values
    x0_mean = np.mean(x0)
    x1_mean = np.mean(x1)

    # Mean absolute difference
    # np.subtract accepts any array_like, including lists and tuples,
    # which do not support the "-" operator
    abs_diff = np.mean( np.abs( np.subtract(x1, x0) ) )

    # Metric value: normalize by the smaller of the two means
    if x1_mean >= x0_mean:
        result = abs_diff / x0_mean
    else:
        result = abs_diff / x1_mean

    return result
Compute Normalized Mean Absolute Error Factor (NMAEF)
Definition from Yu et al. (2006) https://doi.org/10.1002/asl.125
Parameters
- x0 (array_like): reference values
- x1 (array_like): experiment values