# Source code for pytesmo.temporal_matching

'''
Created on Apr 12, 2013

Provides a temporal matching function

@author: Sebastian Hahn Sebastian.Hahn@geo.tuwien.ac.at
'''
import numpy as np
import scipy.interpolate as sc_int
import pandas as pd


def df_match(reference, *args, **kwds):
    '''
    Finds temporal match between the reference pandas.DataFrame (index has to
    be datetime) and n other pandas.DataFrame (index has to be datetime).

    For every timestamp of the reference index the temporally nearest row of
    each other dataset is looked up (nearest neighbour search on the time
    axis). The datasets passed in are not modified.

    Parameters
    ----------
    reference : pandas.DataFrame or pandas.Series
        The index of this dataframe will be the reference.
    *args : pandas.DataFrame or pandas.Series
        The index of this dataframe(s) will be matched.
    window : float
        Fraction of days of the maximum pos./neg. distance allowed, i.e. the
        value of window represents the half-window size (e.g. window=0.5,
        will search for matches between -12 and +12 hours). Rows whose match
        is further away are set to NaN. (default: None)
    dropna : boolean
        Drop rows of the matched result that contain a NaN in any column
        (default: False)
    dropduplicates : boolean
        If one row of a dataset was matched to several reference timestamps,
        keep only the match with the smallest absolute distance. Rows
        containing NaN are dropped first. (default: False)

    Returns
    -------
    temporal_matched_args : pandas.DataFrame or tuple of pandas.DataFrame
        One DataFrame per matched dataset, indexed by the matched reference
        timestamps. Besides the columns of the dataset it contains the
        columns 'distance' (matched time minus reference time in days) and
        'index' (original timestamp of the matched row). A single DataFrame
        is returned if exactly one dataset was passed, otherwise a tuple.

    Raises
    ------
    IndexError
        If the reference index is empty.
    '''
    window = kwds.get('window')
    dropna = bool(kwds.get('dropna'))
    dropduplicates = bool(kwds.get('dropduplicates'))

    one_ns = np.timedelta64(1, 'ns')
    one_day = np.timedelta64(1, 'D')

    ref_times = reference.index.values
    origin = ref_times[0]
    # griddata is given plain float offsets (nanoseconds since the first
    # reference timestamp) so it never has to handle timedelta64 arrays.
    ref_step = (ref_times - origin) / one_ns

    temporal_matched_args = []

    for arg in args:
        # Always work on a private DataFrame: helper columns are added
        # below and must not leak into the caller's object.
        if isinstance(arg, pd.Series):
            arg = pd.DataFrame(arg)
        else:
            arg = arg.copy()

        arg_times = arg.index.values
        comp_step = (arg_times - origin) / one_ns

        # Position (row number) of the nearest row of arg for every
        # reference timestamp; NaN where no neighbour was found.
        matched = sc_int.griddata(comp_step, np.arange(comp_step.size),
                                  ref_step, "nearest")
        matched = np.asarray(matched, dtype=np.float64)

        valid_match = ~np.isnan(matched)
        nearest_pos = matched[valid_match].astype(np.intp)

        # Explicit float array: an integer array could not hold NaN.
        distance = np.full(matched.shape, np.nan, dtype=np.float64)
        distance[valid_match] = \
            (arg_times[nearest_pos] - ref_times[valid_match]) / one_day

        arg['index'] = arg_times
        # float keys so both sides of the merge share one dtype
        arg['merge_key'] = np.arange(len(arg), dtype=np.float64)

        arg_matched = pd.DataFrame({'merge_key': matched,
                                    'distance': distance,
                                    'ref_index': ref_times})
        arg_matched = arg_matched.merge(arg, on="merge_key", how="left")
        arg_matched.index = arg_matched['ref_index'].values
        arg_matched = arg_matched.sort_index()

        if window is not None:
            # Blank out complete rows whose match lies outside the window.
            # where() upcasts integer columns as needed instead of
            # assigning NaN into them in place.
            within = ~(arg_matched['distance'].abs() > window).values
            keep = np.repeat(within[:, np.newaxis], arg_matched.shape[1],
                             axis=1)
            arg_matched = arg_matched.where(keep)

        if dropna:
            arg_matched = arg_matched.dropna()

        if dropduplicates:
            arg_matched = arg_matched.dropna()
            if len(arg_matched) > 0:
                # Per matched row of arg keep the reference timestamp with
                # the smallest absolute temporal distance.
                closest = arg_matched.groupby('merge_key')['distance'].apply(
                    lambda dist: dist.abs().idxmin())
                arg_matched = arg_matched.loc[closest.values]

        temporal_matched_args.append(
            arg_matched.drop(['merge_key', 'ref_index'], axis=1))

    if len(temporal_matched_args) == 1:
        return temporal_matched_args[0]
    return tuple(temporal_matched_args)
def matching(reference, *args, **kwargs):
    '''
    Finds temporal match between the reference pandas.Series (index has to
    be datetime) and n other pandas.Series (index has to be datetime).

    Parameters
    ----------
    reference : pandas.Series
        The index of this Series will be the reference.
    *args : pandas.Series
        The index of these Series(s) will be matched.
    window : float
        Fraction of days of the maximum pos./neg. distance allowed, i.e. the
        value of window represents the half-window size (e.g. window=0.5,
        will search for matches between -12 and +12 hours) (default: None)

    Returns
    -------
    temporal_match : pandas.DataFrame
        containing the index of the reference Series and a column for each
        of the other input Series. Rows with a NaN in any column are
        dropped.
    '''
    matched_datasets = df_match(reference, *args, dropna=True,
                                dropduplicates=True, **kwargs)

    # df_match hands back a bare DataFrame when only one dataset was given.
    # Wrap it in a real one-element tuple (note the trailing comma) so the
    # loop below iterates over DataFrames and not over column labels.
    if not isinstance(matched_datasets, tuple):
        matched_datasets = (matched_datasets,)

    matched_data = pd.DataFrame(reference)

    for match in matched_datasets:
        # 'distance' and 'index' are bookkeeping columns added by df_match
        match = match.drop(['distance', 'index'], axis=1)
        matched_data = matched_data.join(match)

    return matched_data.dropna()