# Copyright (c) 2014, Vienna University of Technology (TU Wien), Department
# of Geodesy and Geoinformation (GEO).
# All rights reserved.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of the Vienna University of Technology - Department of
# Geodesy and Geoinformation nor the names of its contributors may be used to
# endorse or promote products derived from this software without specific
# prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL VIENNA UNIVERSITY OF TECHNOLOGY,
# DEPARTMENT OF GEODESY AND GEOINFORMATION BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# Author: Thomas Mistelbauer Thomas.Mistelbauer@geo.tuwien.ac.at
# Creation date: 2014-06-30
import os
import shutil
import pandas as pd
import numpy as np
import poets.image.netcdf as nc
import poets.timedate.dateindex as dt
import poets.grid.grids as gr
from netCDF4 import Dataset, num2date, date2num
from datetime import datetime, timedelta
from poets.grid.grids import ShapeGrid, RegularGrid
from poets.image.resampling import resample_to_shape, average_layers
from poets.io.download import download_http, download_ftp, download_sftp, \
get_file_date, download_local
from poets.io.fileformats import select_file
from poets.io.unpack import unpack, check_compressed
class BasicSource(object):
"""Base Class for data sources.
Parameters
----------
name : str
Name of the data source.
filename : str
Structure/convention of the file name.
    filedate : dict
        Positions of the date fields in the filename, given as a dict of
        tuples.
temp_res : str
Temporal resolution of the source.
rootpath : str
Root path where all data will be stored.
host : str
Link to data host.
protocol : str
Protocol for data transfer.
username : str, optional
Username for data access.
password : str, optional
Password for data access.
port : int, optional
Port to data host, defaults to 22.
directory : str, optional
Path to data on host.
dirstruct : list of strings, optional
Structure of source directory, each list item represents a
subdirectory.
begin_date : datetime, optional
Date from which on data is available.
    variables : string or list of strings, optional
        Variables used from data source, defaults to None; in that case
        the variables are determined from the NetCDF file.
nan_value : int, float, optional
Nan value of the original data as given by the data provider.
    valid_range : tuple of int or float, optional
        Valid range of data, given as (minimum, maximum).
    data_range : tuple of int or float, optional
        Range of the values as given in the raw data (minimum, maximum).
        Will be scaled to valid_range.
    ffilter : str, optional
        Pattern that appears in the filename. Can be used to filter out
        unneeded files if multiple files per date are provided.
colorbar : str, optional
Colorbar to use, use one from
http://matplotlib.org/examples/color/colormaps_reference.html,
defaults to jet.
unit : str, optional
Unit of dataset for displaying in legend. Does not have to be set
if unit is specified in input file metadata. Defaults to None.
dest_nan_value : int, float, optional
NaN value in the final NetCDF file.
dest_regions : list of str, optional
Regions of interest where data should be resampled to.
dest_sp_res : int, float, optional
Spatial resolution of the destination NetCDF file, defaults to 0.25
degree.
dest_temp_res : string, optional
Temporal resolution of the destination NetCDF file, possible values:
('day', 'week', 'dekad', 'month'), defaults to dekad.
dest_start_date : datetime, optional
Start date of the destination NetCDF file, defaults to 2000-01-01.
Attributes
----------
name : str
Name of the data source.
filename : str
Structure/convention of the file name.
    filedate : dict
        Positions of the date fields in the filename, given as a dict of
        tuples.
temp_res : str
Temporal resolution of the source.
host : str
Link to data host.
protocol : str
Protocol for data transfer.
username : str
Username for data access.
password : str
Password for data access.
port : int
Port to data host.
directory : str
Path to data on host.
dirstruct : list of strings
Structure of source directory, each list item represents a
subdirectory.
begin_date : datetime
Date from which on data is available.
    ffilter : str
        Pattern that appears in the filename.
    colorbar : str
        Colorbar to use.
unit : str
Unit of dataset for displaying in legend.
variables : list of strings
Variables used from data source.
nan_value : int, float
Not a number value of the original data as given by the data provider.
    valid_range : tuple of int or float
        Valid range of data, given as (minimum, maximum).
    data_range : tuple of int or float
        Range of the values as given in the raw data (minimum, maximum).
dest_nan_value : int, float, optional
NaN value in the final NetCDF file.
tmp_path : str
Path where temporary files are stored.
rawdata_path : str
Path where original files are stored.
data_path : str
Path where resampled NetCDF file is stored.
dest_regions : list of str
Regions of interest where data is resampled to.
dest_sp_res : int, float
Spatial resolution of the destination NetCDF file.
dest_temp_res : string
Temporal resolution of the destination NetCDF file.
dest_start_date : datetime.datetime
First date of the dataset in the destination NetCDF file.
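
    Examples
    --------
    A minimal configuration sketch. All values below (source name, file
    name pattern, date field positions, host and paths) are hypothetical
    placeholders, not values prescribed by poets:

    >>> from datetime import datetime
    >>> source = BasicSource(name='EXAMPLE',
    ...                      filename='data_YYYYMMDD.nc',
    ...                      filedate={'YYYY': (5, 9), 'MM': (9, 11),
    ...                                'DD': (11, 13)},
    ...                      temp_res='daily',
    ...                      rootpath='/tmp/poets',
    ...                      host='http://example.com/data/',
    ...                      protocol='HTTP',
    ...                      begin_date=datetime(2000, 1, 1),
    ...                      dest_regions=['AU'],
    ...                      dest_sp_res=0.25,
    ...                      dest_temp_res='dekad')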
"""
def __init__(self, name, filename, filedate, temp_res, rootpath,
host, protocol, username=None, password=None, port=22,
directory=None, dirstruct=None,
begin_date=None, ffilter=None, colorbar='jet',
variables=None, nan_value=None, valid_range=None, unit=None,
dest_nan_value=-99, dest_regions=None, dest_sp_res=0.25,
dest_temp_res='dekad', dest_start_date=datetime(2000, 1, 1),
data_range=None):
self.name = name
self.filename = filename
self.filedate = filedate
self.temp_res = temp_res
self.host = host
self.protocol = protocol
self.username = username
self.password = password
self.port = port
self.directory = directory
self.dirstruct = dirstruct
if begin_date is None:
self.begin_date = dest_start_date
else:
self.begin_date = begin_date
if type(variables) == str:
self.variables = [variables]
else:
self.variables = variables
self.ffilter = ffilter
self.unit = unit
self.nan_value = nan_value
self.valid_range = valid_range
self.data_range = data_range
self.colorbar = colorbar
self.dest_nan_value = dest_nan_value
self.dest_regions = dest_regions
self.dest_sp_res = dest_sp_res
self.dest_temp_res = dest_temp_res
self.dest_start_date = dest_start_date
self.rawdata_path = os.path.join(rootpath, 'RAWDATA', name)
self.tmp_path = os.path.join(rootpath, 'TMP')
if not os.path.exists(self.tmp_path):
os.mkdir(self.tmp_path)
self.data_path = os.path.join(rootpath, 'DATA')
if not os.path.exists(self.data_path):
os.mkdir(self.data_path)
if self.host[-1] != '/':
self.host += '/'
if self.directory is not None and self.directory[-1] != '/':
self.directory += '/'
def _check_current_date(self, begin=True, end=True):
"""Helper method that checks the current date of individual variables
in the netCDF data file.
Parameters
----------
begin : bool, optional
If set True, begin will be returned as None.
end : bool, optional
If set True, end will be returned as None.
Returns
-------
dates : dict of dicts
Dictionary with dates of each parameter. None if no date available.
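
        Examples
        --------
        Illustration of the returned structure; the region, variable name
        and dates are hypothetical:

        >>> source._check_current_date()
        {'AU': {'EXAMPLE_var': [datetime.datetime(2000, 1, 1, 0, 0),
        datetime.datetime(2014, 5, 21, 0, 0)]}}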
"""
dates = {}
for region in self.dest_regions:
nc_name = os.path.join(self.data_path, region + '_'
+ str(self.dest_sp_res) + '_'
+ str(self.dest_temp_res) + '.nc')
if os.path.exists(nc_name):
dates[region] = {}
variables = self.get_variables()
with Dataset(nc_name, 'r', format='NETCDF4') as nc:
for var in variables:
dates[region][var] = []
if begin:
for i in range(0, nc.variables['time'].size - 1):
if nc.variables[var][i].mask.min():
continue
else:
times = nc.variables['time']
dat = num2date(nc.variables['time'][i],
units=times.units,
calendar=times.calendar)
dates[region][var].append(dat)
break
else:
dates[region][var].append(None)
if end:
for i in range(nc.variables['time'].size - 1,
- 1, -1):
if nc.variables[var][i].mask.min():
continue
else:
times = nc.variables['time']
dat = num2date(nc.variables['time'][i],
units=times.units,
calendar=times.calendar)
dates[region][var].append(dat)
break
else:
dates[region][var].append(None)
if dates[region][var] in [[None], []]:
dates[region][var] = [None, None]
else:
dates = None
break
return dates
def _get_download_date(self):
"""Gets the date from which to start the data download.
Returns
-------
begin : datetime
date from which to start the data download.
"""
dates = self._check_current_date(begin=False)
if dates is not None:
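            # Scan all regions and variables for the oldest date up to
            # which data has already been written; the download resumes
            # on the day after that date.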
begin = datetime.now()
for region in self.dest_regions:
variables = self.get_variables()
if variables == []:
begin = self.dest_start_date
else:
for var in variables:
if dates[region][var][1] is not None:
if dates[region][var][1] < begin:
begin = dates[region][var][1]
begin += timedelta(days=1)
else:
if self.dest_start_date < self.begin_date:
begin = self.begin_date
else:
begin = self.dest_start_date
else:
begin = self.begin_date
return begin
def _get_tmp_filepath(self, prefix, region):
"""Creates path to a temporary directory.
Returns
-------
str
Path to the temporary direcotry
"""
filename = ('_' + prefix + '_' + region + '_' + str(self.dest_sp_res)
+ '_' + str(self.dest_temp_res) + '.nc')
return os.path.join(self.tmp_path, filename)
def _resample_spatial(self, region, begin, end, delete_rawdata,
shapefile=None):
"""Helper method that calls spatial resampling routines.
        Parameters
        ----------
        region : str
            Identifier of the region in the shapefile. If the default
            shapefile is used, this is the FIPS country code
            (https://en.wikipedia.org/wiki/FIPS_country_code).
        begin : datetime
            Start date of resampling.
        end : datetime
            End date of resampling.
        delete_rawdata : bool
            True if original downloaded files should be deleted after
            resampling.
        shapefile : str, optional
            Path to shape file, uses "world country admin boundary
            shapefile" by default.
        """
dest_file = self._get_tmp_filepath('spatial', region)
dirList = os.listdir(self.rawdata_path)
dirList.sort()
if region == 'global':
grid = gr.RegularGrid(sp_res=self.dest_sp_res)
else:
grid = gr.ShapeGrid(region, self.dest_sp_res, shapefile)
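        # Iterate over the raw files in chronological order; files outside
        # [begin, end] are skipped, compressed files are unpacked before
        # resampling and their unpacked copies removed afterwards.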
for item in dirList:
src_file = os.path.join(self.rawdata_path, item)
fdate = get_file_date(item, self.filedate)
if begin is not None:
if fdate < begin:
continue
if end is not None:
if fdate > end:
continue
if check_compressed(src_file):
dirname = os.path.splitext(item)[0]
dirpath = os.path.join(self.rawdata_path, dirname)
unpack(src_file)
src_file = select_file(os.listdir(dirpath))
src_file = os.path.join(dirpath, src_file)
print '.',
image, _, _, _, timestamp, metadata = \
resample_to_shape(src_file, region, self.dest_sp_res, grid,
self.name, self.nan_value,
self.dest_nan_value, self.variables,
shapefile)
if timestamp is None:
timestamp = get_file_date(item, self.filedate)
if self.temp_res == self.dest_temp_res:
filename = (region + '_' + str(self.dest_sp_res) + '_'
+ str(self.dest_temp_res) + '.nc')
dfile = os.path.join(self.data_path, filename)
nc.save_image(image, timestamp, region, metadata, dfile,
self.dest_start_date, self.dest_sp_res,
self.dest_nan_value, shapefile,
self.dest_temp_res)
else:
nc.write_tmp_file(image, timestamp, region, metadata,
dest_file, self.dest_start_date,
self.dest_sp_res, self.dest_nan_value,
shapefile)
# deletes unpacked files if existing
if check_compressed(item):
shutil.rmtree(os.path.join(self.rawdata_path,
os.path.splitext(item)[0]))
print ''
def _resample_temporal(self, region, shapefile=None):
"""Helper method that calls temporal resampling routines.
        Parameters
        ----------
        region : str
            Identifier of the region in the shapefile. If the default
            shapefile is used, this is the FIPS country code.
        shapefile : str, optional
            Path to shape file, uses "world country admin boundary
            shapefile" by default.
        """
src_file = self._get_tmp_filepath('spatial', region)
if not os.path.exists(src_file):
            print '[INFO] No data available for this period'
return False
data = {}
variables, _, period = nc.get_properties(src_file)
dtindex = dt.get_dtindex(self.dest_temp_res, period[0], period[1])
for date in dtindex:
if date > period[1]:
continue
print date
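            # Dekads cover days 1-10, 11-20 and 21 to the end of the
            # month; date marks a dekad end, so the averaging window
            # starts nine days earlier, or on the 21st for the last
            # dekad of the month.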
if self.dest_temp_res == 'dekad':
if date.day < 21:
begin = datetime(date.year, date.month, date.day - 10 + 1)
else:
begin = datetime(date.year, date.month, 21)
end = date
else:
begin = period[0]
end = date
data = {}
metadata = {}
for var in variables:
img, _, _, meta = \
nc.read_variable(src_file, var, begin, end)
metadata[var] = meta
data[var] = average_layers(img, self.dest_nan_value)
filename = (region + '_' + str(self.dest_sp_res) + '_'
+ str(self.dest_temp_res) + '.nc')
dest_file = os.path.join(self.data_path, filename)
nc.save_image(data, date, region, metadata, dest_file,
self.dest_start_date, self.dest_sp_res,
self.dest_nan_value, shapefile, self.dest_temp_res)
# delete intermediate netCDF file
print ''
os.unlink(src_file)
def _scale_values(self, data):
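        """Linearly rescales data from data_range to valid_range.

        Values are mapped as

            scaled = (data - data_min) / (data_max - data_min)
                     * (valid_max - valid_min) + valid_min

        For example, with data_range (0, 250) and valid_range (0.0, 1.0),
        a raw value of 50 becomes 0.2. If valid_range or data_range is not
        set, the data is returned unchanged.

        Parameters
        ----------
        data : numpy.ndarray or pandas.Series
            Data as given in the raw files.

        Returns
        -------
        data : numpy.ndarray or pandas.Series
            Rescaled data.
        """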
if self.valid_range is not None:
if self.data_range is not None:
data = ((data - self.data_range[0]) /
(self.data_range[1] - self.data_range[0]) *
(self.valid_range[1] - self.valid_range[0]) +
self.valid_range[0])
return data
    def download(self, download_path=None, begin=None, end=None):
""""Download data
Parameters
----------
begin : datetime, optional
start date of download, default to None
end : datetime, optional
start date of download, default to None
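
        Examples
        --------
        A usage sketch; assumes source is a configured BasicSource, the
        date range is arbitrary and the returned flag mirrors the success
        status of the protocol-specific downloader:

        >>> from datetime import datetime
        >>> source.download(begin=datetime(2014, 1, 1),
        ...                 end=datetime(2014, 1, 31))
        True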
"""
if begin is None:
if self.dest_start_date < self.begin_date:
begin = self.begin_date
else:
begin = self.dest_start_date
if self.protocol in ['HTTP', 'http']:
check = download_http(self.rawdata_path, self.host,
self.directory, self.filename, self.filedate,
self.dirstruct, begin=begin, end=end,
ffilter=self.ffilter)
elif self.protocol in ['FTP', 'ftp']:
check = download_ftp(self.rawdata_path, self.host, self.directory,
self.filedate, self.port, self.username,
self.password, self.dirstruct, begin=begin,
end=end, ffilter=self.ffilter)
elif self.protocol in ['SFTP', 'sftp']:
check = download_sftp(self.rawdata_path, self.host,
self.directory, self.port, self.username,
self.password, self.filedate, self.dirstruct,
begin=begin, end=end, ffilter=self.ffilter)
elif self.protocol in ['local', 'LOCAL']:
check = download_local(self.rawdata_path, directory=self.host,
filedate=self.filedate,
dirstruct=self.dirstruct, begin=begin,
end=end, ffilter=self.ffilter)
        else:
            raise ValueError('Unsupported protocol: ' + str(self.protocol))
        return check
    def resample(self, begin=None, end=None, delete_rawdata=False,
shapefile=None, stepwise=True):
"""Resamples source data to given spatial and temporal resolution.
Writes resampled images into a netCDF data file. Deletes original
files if flag delete_rawdata is set True.
Parameters
----------
        begin : datetime, optional
            Start date of resampling.
        end : datetime, optional
            End date of resampling.
        delete_rawdata : bool, optional
            Original files will be deleted from rawdata_path if set True.
        shapefile : str, optional
            Path to shape file, uses "world country admin boundary
            shapefile" by default.
        stepwise : bool, optional
            If True, data is resampled period by period along the
            destination temporal resolution, defaults to True.
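
        Examples
        --------
        A usage sketch; assumes source is a configured BasicSource with
        raw files already downloaded:

        >>> from datetime import datetime
        >>> source.resample(begin=datetime(2014, 1, 1),
        ...                 end=datetime(2014, 1, 31))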
"""
if len(os.listdir(self.tmp_path)) != 0:
for fname in os.listdir(self.tmp_path):
if '.nc' in fname:
os.remove(os.path.join(self.tmp_path, fname))
if begin is None:
if self.dest_start_date < self.begin_date:
begin = self.begin_date
else:
begin = self.dest_start_date
if begin < self._get_download_date():
begin = self._get_download_date()
if end is None:
end = datetime.now()
if begin > end:
print '[INFO] everything up to date'
return '[INFO] everything up to date'
if stepwise:
drange = dt.get_dtindex(self.dest_temp_res, begin, end)
for i, date in enumerate(drange):
if date > end:
continue
if i == 0:
start = begin
else:
                    if self.dest_temp_res in ['dekad', 'dekadal', 'week',
                                              'weekly']:
start = drange[i - 1] + timedelta(days=1)
else:
start = date
stop = date
print '[INFO] ' + str(start) + '-' + str(stop)
for region in self.dest_regions:
print '[INFO] resampling to region ' + region
print '[INFO] performing spatial resampling ',
self._resample_spatial(region, start, stop, delete_rawdata,
shapefile)
if self.temp_res == self.dest_temp_res:
print '[INFO] skipping temporal resampling'
else:
print '[INFO] performing temporal resampling ',
self._resample_temporal(region, shapefile)
else:
print '[INFO] ' + str(begin) + '-' + str(end)
for region in self.dest_regions:
print '[INFO] resampling to region ' + region
print '[INFO] performing spatial resampling ',
self._resample_spatial(region, begin, end, delete_rawdata,
shapefile)
if self.temp_res == self.dest_temp_res:
print '[INFO] skipping temporal resampling'
else:
print '[INFO] performing temporal resampling ',
self._resample_temporal(region, shapefile)
if delete_rawdata:
print '[INFO] Cleaning up rawdata'
dirList = os.listdir(self.rawdata_path)
dirList.sort()
for item in dirList:
src_file = os.path.join(self.rawdata_path, item)
os.unlink(src_file)
    def download_and_resample(self, download_path=None, begin=None, end=None,
delete_rawdata=False, shapefile=None):
"""Downloads and resamples data.
Parameters
----------
        download_path : str, optional
            Path where the downloaded files are saved.
        begin : datetime.date, optional
            Start date; if not given, defaults to the later of
            dest_start_date and begin_date, or to the day after the last
            date already present in the local repository.
        end : datetime.date, optional
            End date; set to today if not given.
        delete_rawdata : bool, optional
            Original files will be deleted from rawdata_path if set True.
shapefile : str, optional
Path to shape file, uses "world country admin boundary shapefile"
by default.
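
        Examples
        --------
        A usage sketch; assumes source is a configured BasicSource.
        Downloads and resamples period by period along dest_temp_res:

        >>> source.download_and_resample(delete_rawdata=True)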
"""
if begin is None:
if self.dest_start_date < self.begin_date:
begin = self.begin_date
else:
begin = self.dest_start_date
if begin < self._get_download_date():
begin = self._get_download_date()
if end is None:
end = datetime.now()
if begin > end:
print '[INFO] everything up to date'
return '[INFO] everything up to date'
drange = dt.get_dtindex(self.dest_temp_res, begin, end)
intervals = []
for i, date in enumerate(drange):
if date > end:
continue
if i == 0:
start = begin
else:
if self.dest_temp_res in ['dekad', 'dekadal', 'week',
'weekly']:
start = drange[i - 1] + timedelta(days=1)
else:
start = date
stop = date
filecheck = self.download(download_path, start, stop)
if filecheck is True:
self.resample(start, stop, delete_rawdata, shapefile, False)
else:
print '[WARNING] no data available for this date'
    def read_ts(self, location, region=None, variable=None, shapefile=None,
scaled=True):
"""Gets timeseries from netCDF file for a gridpoint.
Parameters
----------
        location : int or tuple of floats
            Either a grid point index given as integer value, or a
            (longitude, latitude) tuple.
region : str, optional
Region of interest, set to first defined region if not set.
variable : str, optional
Variable to display, selects all available variables if None.
shapefile : str, optional
Path to custom shapefile.
scaled : bool, optional
If true, data will be scaled to a predefined range; if false, data
will be shown as given in rawdata file; defaults to True
Returns
-------
df : pd.DataFrame
Timeseries for selected variables.
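
        Examples
        --------
        Reading a time series by coordinates; region, variable and
        coordinates are hypothetical. The returned column is named
        SOURCENAME_variable, as stored in the netCDF file:

        >>> df = source.read_ts((16.37, 48.21), region='AU',
        ...                     variable='var')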
"""
if region is None:
region = self.dest_regions[0]
if type(location) is tuple:
if region == 'global':
grid = RegularGrid(self.dest_sp_res)
else:
grid = ShapeGrid(region, self.dest_sp_res, shapefile)
gp, _ = grid.find_nearest_gpi(location[0], location[1])
else:
gp = location
if variable is None:
if self.variables is None:
variable = self.get_variables()
else:
variable = self.variables
else:
variable = [variable]
source_file = os.path.join(self.data_path,
region + '_' + str(self.dest_sp_res) + '_'
+ str(self.dest_temp_res) + '.nc')
var_dates = self._check_current_date()
with Dataset(source_file, 'r', format='NETCDF4') as nc:
time = nc.variables['time']
dates = num2date(time[:], units=time.units, calendar=time.calendar)
position = np.where(nc.variables['gpi'][:] == gp)
lat_pos = position[0][0]
lon_pos = position[1][0]
df = pd.DataFrame(index=pd.DatetimeIndex(dates))
for var in variable:
if self.name not in var:
ncvar = self.name + '_' + var
else:
ncvar = var
begin = np.where(dates == var_dates[region][ncvar][0])[0][0]
end = np.where(dates == var_dates[region][ncvar][1])[0][0]
df[ncvar] = np.NAN
for i in range(begin, end + 1):
df[ncvar][i] = nc.variables[ncvar][i, lat_pos, lon_pos]
if 'scaling_factor' in nc.variables[ncvar].ncattrs():
vvar = nc.variables[ncvar]
if vvar.getncattr('scaling_factor') < 0:
df[ncvar] = (df[ncvar] *
float(vvar.getncattr('scaling_factor')))
else:
df[ncvar] = (df[ncvar] /
float(vvar.getncattr('scaling_factor')))
                if scaled:
                    df[ncvar] = self._scale_values(df[ncvar])
return df
    def read_img(self, date, region=None, variable=None, scaled=True):
"""Gets images from netCDF file for certain date
Parameters
----------
date : datetime
Date of the image.
region : str, optional
Region of interest, set to first defined region if not set.
        variable : str, optional
            Variable to display, selects the first available variable if
            None.
scaled : bool, optional
If true, data will be scaled to a predefined range; if false, data
will be shown as given in rawdata file; defaults to True.
Returns
-------
img : numpy.ndarray
Image of selected date.
lon : numpy.array
Array with longitudes.
lat : numpy.array
Array with latitudes.
metadata : dict
Dictionary containing metadata of the variable.
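
        Examples
        --------
        Reading a single image; date, region and variable name are
        hypothetical. The returned image has the shape
        (lat.size, lon.size):

        >>> from datetime import datetime
        >>> img, lon, lat, meta = source.read_img(datetime(2014, 5, 31),
        ...                                       region='AU',
        ...                                       variable='var')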
"""
if region is None:
region = self.dest_regions[0]
if variable is None:
if self.variables is None:
variable = self.get_variables()[0]
else:
variable = self.name + '_' + self.variables[0]
else:
# Renames variable name to SOURCE_variable
if self.name not in variable:
variable = self.name + '_' + variable
source_file = os.path.join(self.data_path,
region + '_' + str(self.dest_sp_res)
+ '_' + str(self.dest_temp_res) + '.nc')
        # snap the date to the matching period (e.g. dekad) of the
        # destination temporal resolution
date = dt.check_period(self.dest_temp_res, date)
with Dataset(source_file, 'r', format='NETCDF4') as nc:
time = nc.variables['time']
datenum = date2num(date, units=time.units, calendar=time.calendar)
position = np.where(time[:] == datenum)[0][0]
var = nc.variables[variable]
img = var[position]
lon = nc.variables['lon'][:]
lat = nc.variables['lat'][:]
metadata = {}
for attr in var.ncattrs():
if attr[0] != '_' and attr != 'scale_factor':
metadata[attr] = var.getncattr(attr)
if not metadata:
metadata = None
if 'scaling_factor' in var.ncattrs():
if metadata['scaling_factor'] < 0:
img = img * float(metadata['scaling_factor'])
else:
img = img / float(metadata['scaling_factor'])
            if scaled:
                img = self._scale_values(img)
return img, lon, lat, metadata
    def get_variables(self):
"""
Gets all variables given in the NetCDF file.
Returns
-------
        variables : list of str
            Variables of this data source contained in the NetCDF file.
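
        Examples
        --------
        Variable names carry the source name as prefix (hypothetical
        output):

        >>> source.get_variables()
        ['EXAMPLE_var']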
"""
nc_name = os.path.join(self.data_path, self.dest_regions[0] + '_'
+ str(self.dest_sp_res) + '_'
+ str(self.dest_temp_res) + '.nc')
nc_vars, _, _ = nc.get_properties(nc_name)
variables = []
for var in nc_vars:
if self.name in var:
variables.append(var)
return variables