#Copyright (c) 2013,Vienna University of Technology, Department of Geodesy and Geoinformation
#All rights reserved.
#Redistribution and use in source and binary forms, with or without
#modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the Vienna University of Technology, Department of Geodesy and Geoinformation nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
#ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
#WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
#DISCLAIMED. IN NO EVENT SHALL VIENNA UNIVERSITY OF TECHNOLOGY,
#DEPARTMENT OF GEODESY AND GEOINFORMATION BE LIABLE FOR ANY
#DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
#(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
#LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
#ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
#(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
#SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
'''
Module contains wrappers for methods in pytesmo.metrics which can be given pandas.DataFrames
instead of single numpy.arrays . If the DataFrame has more columns than the function has input parameters
the function will be applied pairwise
Created on Aug 14, 2013
@author: Christoph Paulik Christoph.Paulik@geo.tuwien.ac.at
'''
import numpy as np
import pytesmo.metrics as metrics
from collections import namedtuple
[docs]class DataFrameDimensionError(Exception):
pass
[docs]def bias(df):
"""Bias
Returns
-------
bias : pandas.Dataframe
of shape (len(df.columns),len(df.columns))
See Also
--------
pytesmo.metrics.bias
"""
return pairwise_apply(df,metrics.bias)
[docs]def rmsd(df):
"""Root-mean-square deviation
Returns
-------
result : namedtuple
with column names of df for which the calculation was done as name of the
element separated by '_and_'
See Also
--------
pytesmo.metrics.rmsd
"""
return _to_namedtuple(pairwise_apply(df,metrics.rmsd,comm=True),'rmsd')
[docs]def nrmsd(df):
"""Normalized root-mean-square deviation
Returns
-------
result : namedtuple
with column names of df for which the calculation was done as name of the
element separated by '_and_'
See Also
--------
pytesmo.metrics.nrmsd
"""
return _to_namedtuple(pairwise_apply(df,metrics.nrmsd,comm=True),'nrmsd')
[docs]def ubrmsd(df):
"""Unbiased root-mean-square deviation
Returns
-------
result : namedtuple
with column names of df for which the calculation was done as name of the
element separated by '_and_'
See Also
--------
pytesmo.metrics.ubrmsd
"""
return _to_namedtuple(pairwise_apply(df,metrics.ubrmsd,comm=True),'ubrmsd')
[docs]def mse(df):
"""Mean square error (MSE) as a decomposition of the RMSD into individual error components
Returns
-------
result : namedtuple
with column names of df for which the calculation was done as name of the
element separated by '_and_'
See Also
--------
pytesmo.metrics.mse
"""
MSE, MSEcorr, MSEbias, MSEvar = pairwise_apply(df,metrics.mse,comm=True)
return (_to_namedtuple(MSE, 'MSE'), _to_namedtuple(MSEcorr, 'MSEcorr'),
_to_namedtuple(MSEbias, 'MSEbias'), _to_namedtuple(MSEvar, 'MSEvar'))
[docs]def tcol_error(df):
"""Triple collocation error estimate
In this case df has to have exactly 3 columns, since triple wise
application of a function is not yet implemented and would probably return a
complicated structure
Returns
-------
result : namedtuple
with column names of df
See Also
--------
pytesmo.metrics.tcol_error
"""
if len(df.columns) != 3:
raise DataFrameDimensionError("DataFrame has to have 3 columns")
tcol_result = namedtuple('triple_collocation_error',df.columns)
return tcol_result._make(metrics.tcol_error(df.ix[:,0].values, df.ix[:,1].values, df.ix[:,2].values))
[docs]def nash_sutcliffe(df):
"""Nash Sutcliffe model efficiency coefficient
Returns
-------
result : namedtuple
with column names of df for which the calculation was done as name of the
element separated by '_and_'
See Also
--------
pytesmo.metrics.nash_sutcliffe
"""
return _to_namedtuple(pairwise_apply(df,metrics.nash_sutcliffe,comm=True),'Nash_Sutcliffe')
[docs]def pearsonr(df):
"""
Wrapper for scipy.stats.pearsonr
Returns
-------
result : namedtuple
with column names of df for which the calculation was done as name of the
element separated by '_and_'
See Also
--------
pytesmo.metrics.pearsonr
scipy.stats.pearsonr
"""
r, p = pairwise_apply(df,metrics.pearsonr,comm=True)
return _to_namedtuple(r, 'Pearsons_r'), _to_namedtuple(p, 'p_value')
[docs]def spearmanr(df):
"""
Wrapper for scipy.stats.spearmanr
Returns
-------
result : namedtuple
with column names of df for which the calculation was done as name of the
element separated by '_and_'
See Also
--------
pytesmo.metrics.spearmenr
scipy.stats.spearmenr
"""
r, p = pairwise_apply(df,metrics.spearmanr,comm=True)
return _to_namedtuple(r, 'Spearman_r'), _to_namedtuple(p, 'p_value')
[docs]def kendalltau(df):
"""
Wrapper for scipy.stats.kendalltau
Returns
-------
result : namedtuple
with column names of df for which the calculation was done as name of the
element separated by '_and_'
See Also
--------
pytesmo.metrics.kendalltau
scipy.stats.kendalltau
"""
r, p = pairwise_apply(df,metrics.kendalltau,comm=True)
return _to_namedtuple(r, 'Kendall_tau'), _to_namedtuple(p, 'p_value')
[docs]def pairwise_apply(df, method, comm=False):
"""
Compute given method pairwise for all columns, excluding NA/null values
Parameters
----------
df : pandas.DataFrame
input data, method will be applied to each column pair
method : function
method to apply to each column pair. has to take 2 input arguments of
type numpy.array and return one value or tuple of values
Returns
-------
results : pandas.DataFrame
"""
numeric_df = df._get_numeric_data()
cols = numeric_df.columns
mat = numeric_df.values
mat = mat.T
applyf = method
K = len(cols)
result_empty = np.empty((K, K), dtype=float)
result_empty.fill(np.nan)
# find out how many variables the applyf returns
c = applyf(mat[0], mat[0])
result = []
for index, value in enumerate(np.atleast_1d(c)):
result.append(result_empty)
result = np.array(result)
mask = np.isfinite(mat)
for i, ac in enumerate(mat):
for j, bc in enumerate(mat):
if i == j: continue
if comm and np.isfinite(result[0][i, j]): continue
valid = mask[i] & mask[j]
if not valid.any():continue
if not valid.all():
c = applyf(ac[valid], bc[valid])
else:
c = applyf(ac, bc)
for index, value in enumerate(np.atleast_1d(c)):
result[index][i, j] = value
if comm: result[index][j, i] = value
return_list = []
for data in result:
return_list.append(df._constructor(data, index=cols, columns=cols))
if len(return_list) == 1:
return return_list[0]
else:
return tuple(return_list)
def _to_namedtuple(df,name):
"""
takes df produced by pairwise apply and produces named tuple
of the non duplicate values for commutative operations(the triangle
above the diagonal)
"""
names=[]
values=[]
for i,column in enumerate(df.columns[:-1]):
for column_names in df.columns[i+1:]:
names.append('_and_'.join([df.index[i],column_names]))
values.extend(df[column].values[i+1:])
result = namedtuple(name,names)
return result._make(values)