Source code for pytesmo.io.netcdf.netcdf_dataset

"""
Reads and writes netCDF time series according to the Climate and Forecast
(CF) Metadata Conventions.

Created on Dec 09, 2013

@author: Christoph Paulik christoph.paulik@geo.tuwien.ac.at
"""

import os
import numpy as np
import netCDF4
import datetime
import pandas as pd

import pytesmo.io.dataset_base as dsbase
import pytesmo.grid.grids as grids


class DatasetError(Exception):
    pass

class Dataset(object):
    """
    NetCDF file wrapper class that makes some things easier.

    Parameters
    ----------
    filename : string
        Filename of the netCDF file. If the file already exists it will be
        opened read only unless a different mode is set.
    name : string, optional
        Will be written as a global attribute if the file is a new file.
    file_format : string, optional
        File format.
    mode : string, optional
        Access mode. Default 'r'.
        'r' means read-only; no data can be modified.
        'w' means write; a new file is created, an existing file with the
        same name is deleted.
        'a' and 'r+' mean append (in analogy with serial files); an
        existing file is opened for reading and writing.
        Appending 's' to modes 'w', 'r+' or 'a' will enable unbuffered
        shared access to NETCDF3_CLASSIC or NETCDF3_64BIT formatted files.
        Unbuffered access may be useful even if you don't need shared
        access, since it may be faster for programs that don't access data
        sequentially. This option is ignored for NETCDF4 and
        NETCDF4_CLASSIC formatted files.
    zlib : boolean, optional
        Default True. If set, netCDF compression will be used.
    complevel : int, optional
        Default 4. Compression level used, from 1 (low compression) to
        9 (high compression).
    """

    def __init__(self, filename, name=None, file_format="NETCDF4",
                 mode='r', zlib=True, complevel=4):

        self.dataset_name = name
        self.filename = filename
        self.file = None
        self.file_format = file_format
        self.buf_len = 0
        self.global_attr = {}
        self.global_attr['id'] = os.path.split(self.filename)[1]
        s = "%Y-%m-%d %H:%M:%S"
        self.global_attr['date_created'] = datetime.datetime.now().strftime(s)
        if self.dataset_name is not None:
            self.global_attr['dataset_name'] = self.dataset_name
        self.zlib = zlib
        self.complevel = complevel
        self.mode = mode

        # appending to a file that does not exist yet means creating it
        if self.mode == "a" and not os.path.exists(self.filename):
            self.mode = "w"

        if self.mode == 'w':
            path = os.path.dirname(self.filename)
            # guard against filenames without a directory part
            if path != '' and not os.path.exists(path):
                os.makedirs(path)

        self.dataset = netCDF4.Dataset(self.filename, self.mode,
                                       format=self.file_format)

    def _set_global_attr(self):
        """
        Write global attributes to the netCDF file.
        """
        self.dataset.setncatts(self.global_attr)
        self.global_attr = {}
    def create_dim(self, name, n):
        """
        Create a dimension in the netCDF file if it does not yet exist.

        Parameters
        ----------
        name : str
            Name of the netCDF dimension.
        n : int
            Size of the dimension.
        """
        if name not in self.dataset.dimensions.keys():
            self.dataset.createDimension(name, size=n)
    def write_var(self, name, data=None, dim=None, attr={}, dtype=None,
                  zlib=None, complevel=None, chunksizes=None, **kwargs):
        """
        Create or overwrite values in a netCDF variable. The data will be
        written to disk once flush or close is called.

        Parameters
        ----------
        name : str
            Name of the netCDF variable.
        data : np.ndarray, optional
            Array containing the data. If not given then the variable will
            be left empty.
        dim : tuple, optional
            A tuple containing the dimension names.
        attr : dict, optional
            A dictionary containing the variable attributes.
        dtype : data type, string or numpy.dtype, optional
            If not given, data.dtype will be used.
        zlib : boolean, optional
            Explicit compression for this variable. If not given then the
            global attribute is used.
        complevel : int, optional
            Explicit compression level for this variable. If not given
            then the global attribute is used.
        chunksizes : tuple, optional
            chunksizes can be used to manually specify the HDF5 chunksizes
            for each dimension of the variable.
        """
        fill_value = None
        if '_FillValue' in attr:
            fill_value = attr.pop('_FillValue')

        if dtype is None:
            dtype = data.dtype

        if zlib is None:
            zlib = self.zlib
        if complevel is None:
            complevel = self.complevel

        if name in self.dataset.variables.keys():
            var = self.dataset.variables[name]
        else:
            var = self.dataset.createVariable(name, dtype,
                                              dim, fill_value=fill_value,
                                              zlib=zlib, complevel=complevel,
                                              chunksizes=chunksizes,
                                              **kwargs)
        if data is not None:
            var[:] = data

        # dict.iteritems() was Python 2 only; items() works everywhere
        for attr_name, attr_value in attr.items():
            self.dataset.variables[name].setncattr(attr_name, attr_value)
    def append_var(self, name, data):
        """
        Append data along the unlimited dimension(s) of a variable.

        Parameters
        ----------
        name : string
            Name of the variable to append to.
        data : numpy.array
            numpy array of correct dimension.

        Raises
        ------
        IOError
            If appending to a variable without an unlimited dimension.
        """
        if name in self.dataset.variables.keys():
            var = self.dataset.variables[name]
            dim_unlimited = []
            key = []
            for index, dim in enumerate(var.dimensions):
                unlimited = self.dataset.dimensions[dim].isunlimited()
                dim_unlimited.append(unlimited)
                if not unlimited:
                    # if the dimension is not unlimited set the slice to :
                    key.append(slice(None, None, None))
                else:
                    # if unlimited, set the slice of this dimension to
                    # append, meaning [var.shape[index]:]
                    key.append(slice(var.shape[index], None, None))

            dim_unlimited = np.array(dim_unlimited)
            nr_unlimited = np.where(dim_unlimited)[0].size
            key = tuple(key)
            # if there are unlimited dimensions we can do an append
            if nr_unlimited > 0:
                var[key] = data
            else:
                raise IOError(''.join(('Cannot append to variable that ',
                                       'has no unlimited dimension')))
    def read_var(self, name):
        """
        Read a variable from the netCDF file.

        Parameters
        ----------
        name : string
            Name of the variable.
        """
        if self.mode in ['r', 'r+']:
            if name in self.dataset.variables.keys():
                return self.dataset.variables[name][:]
    def add_global_attr(self, name, value):
        self.global_attr[name] = value
    def flush(self):
        if self.dataset is not None:
            if self.mode in ['w', 'r+']:
                self._set_global_attr()
                self.dataset.sync()
    def close(self):
        if self.dataset is not None:
            self.flush()
            self.dataset.close()
            self.dataset = None
    def __enter__(self):
        return self

    def __exit__(self, value_type, value, traceback):
        self.close()

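# Usage sketch for the Dataset wrapper (illustrative only; the file name
# '/tmp/example.nc' and the variable name 'sm' are hypothetical, not part
# of pytesmo):
#
#   with Dataset('/tmp/example.nc', mode='w') as nc:
#       # one unlimited dimension so append_var can grow the variable later
#       nc.create_dim('obs', None)
#       nc.write_var('sm', data=np.arange(5, dtype=np.float64), dim=('obs',))
#       nc.append_var('sm', np.arange(5, dtype=np.float64))
#       nc.add_global_attr('institution', 'TU Wien')
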
class OrthoMultiTs(Dataset):
    """
    Implementation of the Orthogonal multidimensional array representation
    of time series according to the NetCDF CF-conventions 1.6.

    Parameters
    ----------
    filename : string
        Filename of the netCDF file. If the file already exists it will be
        opened read only unless a different mode is set.
    n_loc : int, optional
        Number of locations that this netCDF file contains time series
        for. Only required for a new file.
    loc_dim_name : string, optional
        Name of the location dimension.
    obs_dim_name : string, optional
        Name of the observations dimension.
    loc_ids_name : string, optional
        Name of the variable that has the location ids stored.
    loc_descr_name : string, optional
        Name of the variable that has additional location information
        stored.
    time_units : string, optional
        Units the time axis is given in.
        Default: "days since 1900-01-01 00:00:00"
    time_var : string, optional
        Name of the time variable. Default: time
    lat_var : string, optional
        Name of the latitude variable. Default: lat
    lon_var : string, optional
        Name of the longitude variable. Default: lon
    alt_var : string, optional
        Name of the altitude variable. Default: alt
    unlim_chunksize : int, optional
        Chunksize to use along unlimited dimensions, other chunksizes
        will be calculated by the netCDF library.
    read_bulk : boolean, optional
        If set to True the data of all locations is read into memory, and
        subsequent calls to read_ts read from the cache and not from disk.
        This makes reading complete files faster.
    read_dates : boolean, optional
        If False, dates will not be read automatically but only on
        specific request. Usable for bulk reading because currently the
        netCDF num2date routine is very slow for big datasets.
    """

    def __init__(self, filename, n_loc=None, loc_dim_name='locations',
                 obs_dim_name='time', loc_ids_name='location_id',
                 loc_descr_name='location_description',
                 time_units="days since 1900-01-01 00:00:00",
                 time_var='time', lat_var='lat', lon_var='lon',
                 alt_var='alt', unlim_chunksize=None, read_bulk=False,
                 read_dates=True, **kwargs):

        super(OrthoMultiTs, self).__init__(filename, **kwargs)

        self.n_loc = n_loc
        self.obs_dim_name = obs_dim_name
        self.loc_ids_name = loc_ids_name
        self.loc_dim_name = loc_dim_name
        self.loc_descr_name = loc_descr_name
        self.time_var = time_var
        self.lat_var = lat_var
        self.lon_var = lon_var
        self.alt_var = alt_var
        self.time_units = time_units

        self.unlim_chunksize = unlim_chunksize
        if unlim_chunksize is not None:
            self.unlim_chunksize = [unlim_chunksize]

        # variables to track write operations
        self.write_operations = 0
        self.write_offset = None

        # variable which lists the variables that should not be
        # considered time series even if they have the correct dimension
        self.not_timeseries = [self.time_var]

        # initialize dimensions and index variable
        if self.mode == 'w':
            self._init_dimensions_and_lookup()
            self._init_location_variables()
            self._init_location_id_and_time()

            self.global_attr['featureType'] = 'timeSeries'

        # index, to be read upon first reading operation
        self.index = None

        # date variables; for OrthoMultiTs the dates can be stored
        # since they are the same for all variables in a file
        self.constant_dates = True
        self.dates = None
        self.read_dates_auto = read_dates

        # if read_bulk is activated the arrays will be read into the local
        # variables dict; if it is not activated the data will be read
        # from the netCDF variables
        self.read_bulk = read_bulk
        if not self.read_bulk:
            self.variables = self.dataset.variables
        else:
            self.variables = {}

    def _init_dimensions_and_lookup(self):
        """
        Initializes the dimensions and variables for the lookup
        between locations and entries in the time series.
        """
        if self.n_loc is None:
            raise ValueError('Number of locations '
                             'has to be set for a new file')

        self.create_dim(self.loc_dim_name, self.n_loc)
        self.create_dim(self.obs_dim_name, None)
    def _init_location_id_and_time(self):
        """
        Initialize the variables that are the basis of the format:
        the location ids and the time variable.
        """
        # make variable that contains the location id
        self.write_var(self.loc_ids_name, data=None, dim=self.loc_dim_name,
                       dtype=np.int)

        self.write_var(self.loc_descr_name, data=None,
                       dim=self.loc_dim_name, dtype='str')

        # initialize time variable
        self.write_var(self.time_var, data=None, dim=self.obs_dim_name,
                       attr={'standard_name': 'time',
                             'long_name': 'time of measurement',
                             'units': self.time_units},
                       dtype=np.double, chunksizes=self.unlim_chunksize)

    def _init_location_variables(self):
        # write station information: longitude, latitude and altitude
        self.write_var(self.lon_var, data=None, dim=self.loc_dim_name,
                       attr={'standard_name': 'longitude',
                             'long_name': 'location longitude',
                             'units': 'degrees_east',
                             'valid_range': (-180.0, 180.0)},
                       dtype=np.float)
        self.write_var(self.lat_var, data=None, dim=self.loc_dim_name,
                       attr={'standard_name': 'latitude',
                             'long_name': 'location latitude',
                             'units': 'degrees_north',
                             'valid_range': (-90.0, 90.0)},
                       dtype=np.float)
        self.write_var(self.alt_var, data=None, dim=self.loc_dim_name,
                       attr={'standard_name': 'height',
                             'long_name': 'vertical distance above the '
                                          'surface',
                             'units': 'm',
                             'positive': 'up',
                             'axis': 'Z'},
                       dtype=np.float)

    def _read_index(self):
        self.index = self.dataset.variables[self.loc_ids_name][:]

    def _find_free_index_pos(self):
        """
        If the index is not yet filled completely this function gets the
        position of the first free index.

        This function depends on a masked array being used if no data is
        yet in the file.

        Returns
        -------
        idx : int
            First free index position.

        Raises
        ------
        DatasetError
            If no free index is found.
        """
        if self.index is None:
            self._read_index()

        masked = np.where(self.index.mask)[0]

        # all indexes already filled
        if len(masked) == 0:
            raise DatasetError('No free index available')
        else:
            idx = np.min(masked)

        return idx

    def _get_loc_index(self, loc_id):
        """
        Gets the index of a location id in the index variable.
        """
        if self.index is None:
            self._read_index()
        loc_ix = np.where(loc_id == self.index)[0]
        if loc_ix.size != 1:
            raise IOError('Index problem, %d elements found' % loc_ix.size)
        return loc_ix[0]

    def _add_location(self, loc_id, lon, lat, alt=None, loc_descr=None):
        """
        Add a new location to the dataset.

        Parameters
        ----------
        loc_id : int or numpy.array
            Location id.
        lon : float or numpy.array
            Longitudes of the location.
        lat : float or numpy.array
            Latitudes of the location.
        alt : float or numpy.array
            Altitude of the location.
        loc_descr : string or numpy.array
            Location description.
        """
        if type(loc_id) != np.ndarray:
            loc_id = np.array([loc_id])

        if type(lon) != np.ndarray:
            lon = np.array([lon])
        # netCDF library can not handle arrays of length 1 that contain
        # only a None value
        if lon.size == 1 and lon[0] is None:
            lon = None

        if type(lat) != np.ndarray:
            lat = np.array([lat])
        # netCDF library can not handle arrays of length 1 that contain
        # only a None value
        if lat.size == 1 and lat[0] is None:
            lat = None

        if alt is not None:
            if type(alt) != np.ndarray:
                alt = np.array([alt])
            # netCDF library can not handle arrays of length 1 that
            # contain only a None value
            if alt.size == 1 and alt[0] is None:
                alt = None

        # remove location ids that are already in the file
        locations = np.ma.compressed(
            self.dataset.variables[self.loc_ids_name][:])
        loc_count = len(locations)
        if loc_count > 0:
            loc_ids_new = np.invert(np.in1d(loc_id, locations))
            if len(np.nonzero(loc_ids_new)[0]) == 0:
                # no new locations to add
                return None
        else:
            loc_ids_new = slice(None, None, None)

        idx = self._find_free_index_pos()

        index = np.arange(len(loc_id[loc_ids_new])) + idx

        self.dataset.variables[self.loc_ids_name][index] = \
            loc_id[loc_ids_new]

        if lon is not None:
            self.dataset.variables[self.lon_var][index] = lon[loc_ids_new]
        if lat is not None:
            self.dataset.variables[self.lat_var][index] = lat[loc_ids_new]
        if alt is not None:
            self.dataset.variables[self.alt_var][index] = alt[loc_ids_new]

        if loc_descr is not None:
            if type(loc_descr) != np.ndarray:
                loc_descr = np.array(loc_descr)

            if len(index) == 1:
                index = int(index[0])
                loc_ids_new = 0
                self.dataset.variables[self.loc_descr_name][
                    index] = str(loc_descr)
            else:
                self.dataset.variables[self.loc_descr_name][
                    index] = loc_descr[loc_ids_new].astype(object)

        # update index variable after adding a location
        self.index[index] = np.ma.asarray(loc_id)

        return idx

    def _get_all_ts_variables(self):
        """
        Gets all variable names that have self.obs_dim_name as a
        dimension, indicating that they are time series observations.
        This does not include the self.time_var variable.

        Returns
        -------
        variables : list
            List of variable names.
        """
        ts_var = []

        for variable_name in self.dataset.variables:
            if variable_name not in self.not_timeseries:
                if self.obs_dim_name in \
                        self.dataset.variables[variable_name].dimensions:
                    ts_var.append(variable_name)

        return ts_var

    def _get_index_of_ts(self, loc_id):
        """
        Get the index of a time series.
        """
        try:
            loc_ix = self._get_loc_index(loc_id)
        except IOError:
            msg = "".join(("Time series for Location #",
                           loc_id.__str__(), " not found."))
            raise IOError(msg)

        # [loc_ix, :]
        _slice = (loc_ix, slice(None, None, None))
        return _slice

    def _get_loc_ix_from_obs_ix(self, obs_ix):
        """
        Get location index from observation index. In the case of
        OrthoMultiTs all measurements are taken at the same times,
        therefore all location ids are affected.

        Parameters
        ----------
        obs_ix : int
            Observation index.

        Returns
        -------
        loc_ix : int
            Location index.
        """
        return self.read_var(self.loc_ids_name)
    def read_time(self, loc_id):
        """
        Read the time stamps for the given location id. In this case the
        location id is irrelevant since all locations have the same
        time stamps.
        """
        return self.dataset.variables[self.time_var][:]
    def read_dates(self, loc_id):
        self.dates = netCDF4.num2date(
            self.read_time(loc_id),
            units=self.dataset.variables[self.time_var].units,
            calendar='standard')
        return self.dates
    def _read_var_ts(self, loc_id, var):
        """
        Read a time series of a variable at a given location id.

        Parameters
        ----------
        loc_id : int
            ID of location, can be a grid point id or some other id.
        var : string
            Name of the variable to read.
        """
        index = self._get_index_of_ts(loc_id)
        if self.read_bulk:
            if var not in self.variables.keys():
                self.variables[var] = self.dataset.variables[var][:]
        return self.variables[var][index]
    def read_ts(self, variables, loc_id, dates_direct=False):
        """
        Read time series of variables.

        Parameters
        ----------
        variables : list or string
            Variable name(s) to read.
        loc_id : int
            Location id.
        dates_direct : boolean, optional
            If True the dates are read directly from the netCDF file
            without conversion to datetime.
        """
        if type(variables) != list:
            variables = [variables]

        ts = {}
        for variable in variables:
            data = self._read_var_ts(loc_id, variable)
            ts[variable] = data

        if not dates_direct:
            # only read dates if they should be read automatically
            if self.read_dates_auto:
                # only read dates if they have not been read, or if they
                # are different for each location id, which is the case
                # if self.constant_dates is set to False
                if self.dates is None:
                    self.read_dates(loc_id)
                if not self.constant_dates:
                    self.read_dates(loc_id)
                ts['time'] = self.dates
        else:
            if self.read_dates_auto:
                # only read dates if they should be read automatically
                ts['time'] = self.read_time(loc_id)

        return ts
    def read_all_ts(self, loc_id, dates_direct=False):
        """
        Read a time series of all time series variables at a given
        location id.

        Parameters
        ----------
        loc_id : int
            ID of location, can be a grid point id or some other id.
        dates_direct : boolean, optional
            If True the dates are read directly from the netCDF file
            without conversion to datetime.

        Returns
        -------
        time_series : dict
            Keys of var and time with numpy.arrays as values.
        """
        return self.read_ts(self._get_all_ts_variables(), loc_id,
                            dates_direct=dates_direct)
    def extend_time(self, dates, direct=False):
        """
        Extend the time dimension and variable by the given dates.

        Parameters
        ----------
        dates : numpy.array of datetime objects or floats
        direct : boolean
            If True the dates are already converted into floating point
            numbers of correct magnitude.
        """
        if direct:
            self.append_var(self.time_var, dates)
        else:
            units = self.dataset.variables[self.time_var].units
            self.append_var(self.time_var,
                            netCDF4.date2num(dates, units=units,
                                             calendar='standard'))
    def write_ts(self, loc_id, data, dates, extend_time='first',
                 loc_descr=None, lon=None, lat=None, alt=None,
                 fill_values=None, attributes=None, dates_direct=False):
        """
        Write time series data; if the location does not yet exist it is
        also added to the file. For this data format it is assumed that
        in each write/append cycle the same amount of data is added.

        Parameters
        ----------
        loc_id : int
            Location id.
        data : dict
            Dictionary with variable names as keys and numpy.arrays as
            values.
        dates : numpy.array
            Array of datetime objects.
        extend_time : string or boolean, optional
            One of 'first', True or False.
            'first' : only extend the time variable on the first write
                      operation after opening the file.
            True : extend the time variable when writing.
            False : only write the variable and ignore the dates; the
                    assumption is that the time variable already has the
                    correct length and content.
        attributes : dict, optional
            Dictionary of attributes that should be added to the netCDF
            variables. Can also be a dict of dicts for each variable name
            as in the data dict.
        dates_direct : boolean
            If True the dates are already converted into floating point
            numbers of correct magnitude.
        """
        try:
            idx = self._get_loc_index(loc_id)
        except IOError:
            idx = self._add_location(loc_id, lon, lat, alt, loc_descr)

        # find out if attributes is a dict to be used for all variables or
        # if there is a dictionary of attributes for each variable
        unique_attr = False
        if attributes is not None:
            if sorted(data.keys()) == sorted(attributes.keys()):
                unique_attr = True

        for key in data:
            if data[key].size != dates.size:
                raise DatasetError("timestamps and dataset %s "
                                   "must have the same size" % key)

        # add to time variable only on the first write operation
        if ((self.write_operations == 0 and extend_time == 'first') or
                (type(extend_time) == bool and extend_time)):
            self.length_before_extend = \
                self.dataset.variables[self.time_var].size
            self.extend_time(dates, direct=dates_direct)

        for key in data:
            internal_attributes = {'name': key,
                                   'coordinates': 'lat lon alt'}

            if type(fill_values) == dict:
                internal_attributes['_FillValue'] = fill_values[key]

            if attributes is not None:
                if unique_attr:
                    variable_attributes = attributes[key]
                else:
                    variable_attributes = attributes

                internal_attributes.update(variable_attributes)

            if self.unlim_chunksize is None:
                chunksizes = None
            else:
                chunksizes = [self.n_loc, self.unlim_chunksize[0]]
            self.write_var(key, data=None, dim=(self.loc_dim_name,
                                                self.obs_dim_name),
                           attr=internal_attributes,
                           dtype=data[key].dtype, chunksizes=chunksizes)

            if self.write_offset is None:
                # find the start of elements that are not yet filled
                # with values
                _slice_new = slice(self.length_before_extend, None, None)
                masked = np.where(
                    self.dataset.variables[key][idx, _slice_new].mask)[0]
                # all indexes already filled
                if len(masked) == 0:
                    raise DatasetError('No free data slots available')
                else:
                    self.write_offset = (np.min(masked) +
                                         self.length_before_extend)

            _slice = slice(self.write_offset, None, None)
            # has to be reshaped to 2 dimensions because it is written
            # into a 2D variable, otherwise the netCDF library gets
            # confused; might be a bug in netCDF?
            self.dataset.variables[key][idx, _slice] = \
                data[key].reshape(1, data[key].size)

        self.write_operations += 1
    def write_ts_all_loc(self, loc_ids, data, dates, loc_descrs=None,
                         lons=None, lats=None, alts=None, fill_values=None,
                         attributes=None, dates_direct=False):
        """
        Write time series data in bulk. For this the user has to provide
        a 2D array with dimensions (self.n_loc, dates) that is filled with
        the time series of all grid points in the file.

        Parameters
        ----------
        loc_ids : numpy.array
            Location ids along the first axis of the data array.
        data : dict
            Dictionary with variable names as keys and 2D numpy.arrays as
            values.
        dates : numpy.array
            Array of datetime objects with the same size as the second
            dimension of the data arrays.
        attributes : dict, optional
            Dictionary of attributes that should be added to the netCDF
            variables. Can also be a dict of dicts for each variable name
            as in the data dict.
        dates_direct : boolean
            If True the dates are already converted into floating point
            numbers of correct magnitude.
        """
        if self.n_loc != loc_ids.size:
            raise ValueError(''.join(('loc_ids does not match the number ',
                                      'of locations in the file')))

        for key in data:
            if data[key].shape[1] != dates.size:
                raise DatasetError("timestamps and the second dimension "
                                   "of dataset %s must have "
                                   "the same size" % key)
            if data[key].shape[0] != self.n_loc:
                raise DatasetError("the first dimension of dataset %s "
                                   "must have the same size as the "
                                   "number of locations in the file" % key)

        # make sure zip works even if one of the parameters is not given
        if lons is None:
            lons = np.repeat(None, self.n_loc)
        if lats is None:
            lats = np.repeat(None, self.n_loc)
        if alts is None:
            alts = np.repeat(None, self.n_loc)

        # find out if attributes is a dict to be used for all variables or
        # if there is a dictionary of attributes for each variable
        unique_attr = False
        if attributes is not None:
            if sorted(data.keys()) == sorted(attributes.keys()):
                unique_attr = True

        self._add_location(loc_ids, lons, lats, alts, loc_descrs)

        for key in data:
            internal_attributes = {'name': key,
                                   'coordinates': 'lat lon alt'}

            if type(fill_values) == dict:
                internal_attributes['_FillValue'] = fill_values[key]

            if attributes is not None:
                if unique_attr:
                    variable_attributes = attributes[key]
                else:
                    variable_attributes = attributes

                internal_attributes.update(variable_attributes)

            if self.unlim_chunksize is None:
                chunksizes = None
            else:
                chunksizes = [self.n_loc, self.unlim_chunksize[0]]
            self.write_var(key, data=None, dim=(self.loc_dim_name,
                                                self.obs_dim_name),
                           attr=internal_attributes,
                           dtype=data[key].dtype, chunksizes=chunksizes)

            if self.write_offset is None:
                # the current shape tells us how many elements are
                # already in the file
                self.write_offset = self.dataset.variables[key].shape[1]

            _slice = slice(self.write_offset,
                           self.write_offset + dates.size, None)
            self.dataset.variables[key][:, _slice] = data[key]

        # fill up the time variable
        if dates_direct:
            self.dataset.variables[self.time_var][self.write_offset:] = \
                dates
        else:
            units = self.dataset.variables[self.time_var].units
            self.dataset.variables[self.time_var][self.write_offset:] = \
                netCDF4.date2num(dates, units=units, calendar='standard')

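# Round-trip sketch for OrthoMultiTs (illustrative only; the file name,
# location id and 'soil_moisture' variable are hypothetical):
#
#   dates = np.array([datetime.datetime(2007, 1, d) for d in range(1, 6)])
#   with OrthoMultiTs('/tmp/ortho.nc', n_loc=3, mode='w') as nc:
#       nc.write_ts(42, {'soil_moisture': np.arange(5, dtype=np.float64)},
#                   dates, lon=16.37, lat=48.21)
#
#   with OrthoMultiTs('/tmp/ortho.nc', mode='r') as nc:
#       ts = nc.read_all_ts(42)   # dict with 'soil_moisture' and 'time'
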
class ContiguousRaggedTs(OrthoMultiTs):
    """
    Class that represents a Contiguous ragged array representation of
    time series according to NetCDF CF-conventions 1.6.

    Parameters
    ----------
    filename : string
        Filename of the netCDF file. If the file already exists it will be
        opened read only unless a different mode is set.
    n_loc : int, optional
        Number of locations that this netCDF file contains time series
        for. Only required for a new file.
    n_obs : int, optional
        How many observations will be saved into this netCDF file in
        total. Only required for a new file.
    obs_loc_lut : string, optional
        Variable name in the netCDF file that contains the lookup between
        observations and locations.
    loc_dim_name : string, optional
        Name of the location dimension.
    obs_dim_name : string, optional
        Name of the observations dimension.
    loc_ids_name : string, optional
        Name of the variable that has the location ids stored.
    loc_descr_name : string, optional
        Name of the variable that has additional location information
        stored.
    time_units : string, optional
        Units the time axis is given in.
        Default: "days since 1900-01-01 00:00:00"
    time_var : string, optional
        Name of the time variable. Default: time
    lat_var : string, optional
        Name of the latitude variable. Default: lat
    lon_var : string, optional
        Name of the longitude variable. Default: lon
    alt_var : string, optional
        Name of the altitude variable. Default: alt
    """

    def __init__(self, filename, n_loc=None, n_obs=None,
                 obs_loc_lut='row_size', obs_dim_name='obs', **kwargs):
        self.n_obs = n_obs
        self.obs_loc_lut = obs_loc_lut

        super(ContiguousRaggedTs, self).__init__(filename, n_loc=n_loc,
                                                 obs_dim_name=obs_dim_name,
                                                 **kwargs)
        self.constant_dates = False

    def _init_dimensions_and_lookup(self):
        """
        Initializes the dimensions and variables for the lookup
        between locations and entries in the time series.
        """
        if self.n_loc is None:
            raise ValueError('Number of locations '
                             'has to be set for a new file')
        if self.n_obs is None:
            raise ValueError('Number of observations '
                             'has to be set for a new file')

        self.create_dim(self.loc_dim_name, self.n_loc)
        self.create_dim(self.obs_dim_name, self.n_obs)
        attr = {'long_name': 'number of observations at this location',
                'sample_dimension': self.obs_dim_name}
        self.write_var(self.obs_loc_lut, data=None, dim=self.loc_dim_name,
                       dtype=np.int, attr=attr,
                       chunksizes=self.unlim_chunksize)

    def _get_index_of_ts(self, loc_id):
        """
        Get the slice into the observation dimension for the given
        location id.

        Parameters
        ----------
        loc_id : int
            Location id.

        Raises
        ------
        IOError
            If the location id could not be found.
        """
        try:
            loc_ix = self._get_loc_index(loc_id)
        except IOError:
            msg = "".join(("Time series for Location #",
                           loc_id.__str__(), " not found."))
            raise IOError(msg)

        start = np.sum(self.variables[self.obs_loc_lut][:loc_ix])
        end = np.sum(self.variables[self.obs_loc_lut][:loc_ix + 1])
        return slice(start, end)

    def _get_loc_ix_from_obs_ix(self, obs_ix):
        """
        Get location index from observation index.

        Parameters
        ----------
        obs_ix : int
            Observation index.

        Returns
        -------
        loc_ix : int
            Location index.
        """
        bins = np.hstack((0, np.cumsum(self.variables[self.obs_loc_lut])))
        loc_ix = np.digitize(obs_ix, bins) - 1
        return loc_ix
    def read_time(self, loc_id):
        """
        Read the time stamps for the given location id. In this case it
        works like a normal time series variable.
        """
        return self._read_var_ts(loc_id, self.time_var)
    def write_ts(self, loc_id, data, dates, loc_descr=None, lon=None,
                 lat=None, alt=None, fill_values=None, attributes=None,
                 dates_direct=False):
        """
        Write time series data; if the location does not yet exist it is
        also added to the file.

        Parameters
        ----------
        loc_id : int
            Location id.
        data : dict
            Dictionary with variable names as keys and numpy.arrays as
            values.
        dates : numpy.array
            Array of datetime objects.
        attributes : dict, optional
            Dictionary of attributes that should be added to the netCDF
            variables. Can also be a dict of dicts for each variable name
            as in the data dict.
        dates_direct : boolean
            If True the dates are already converted into floating point
            numbers of correct magnitude.
        """
        try:
            idx = self._get_loc_index(loc_id)
        except IOError:
            idx = self._add_location(loc_id, lon, lat, alt, loc_descr)

        # find out if attributes is a dict to be used for all variables or
        # if there is a dictionary of attributes for each variable
        unique_attr = False
        if attributes is not None:
            if sorted(data.keys()) == sorted(attributes.keys()):
                unique_attr = True

        for key in data:
            if data[key].size != dates.size:
                raise DatasetError("timestamps and dataset %s "
                                   "must have the same size" % key)

        # add the number of new elements to the lookup variable
        self.dataset.variables[self.obs_loc_lut][idx] = dates.size

        index = self._get_index_of_ts(loc_id)
        for key in data:
            internal_attributes = {'name': key,
                                   'coordinates': 'time lat lon alt'}

            if type(fill_values) == dict:
                internal_attributes['_FillValue'] = fill_values[key]

            if attributes is not None:
                if unique_attr:
                    variable_attributes = attributes[key]
                else:
                    variable_attributes = attributes

                internal_attributes.update(variable_attributes)

            self.write_var(key, data=None, dim=self.obs_dim_name,
                           attr=internal_attributes,
                           dtype=data[key].dtype,
                           chunksizes=self.unlim_chunksize)
            self.dataset.variables[key][index] = data[key]

        if dates_direct:
            self.dataset.variables[self.time_var][index] = dates
        else:
            units = self.dataset.variables[self.time_var].units
            self.dataset.variables[self.time_var][index] = \
                netCDF4.date2num(dates, units=units, calendar='standard')

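# Sketch for ContiguousRaggedTs (illustrative only; file name, location
# id and 'sm' variable are hypothetical). n_obs must cover the total
# number of observations that will end up in the file:
#
#   dates = np.array([datetime.datetime(2007, 1, d) for d in range(1, 4)])
#   with ContiguousRaggedTs('/tmp/contiguous.nc', n_loc=2, n_obs=10,
#                           mode='w') as nc:
#       nc.write_ts(1, {'sm': np.array([0.1, 0.2, 0.3])}, dates,
#                   lon=15.0, lat=45.0)
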
class IndexedRaggedTs(ContiguousRaggedTs):
    """
    Class that represents an Indexed ragged array representation of
    time series according to NetCDF CF-conventions 1.6.
    """

    def __init__(self, filename, n_loc=None, obs_loc_lut='locationIndex',
                 **kwargs):
        # set n_obs to None for an unlimited dimension
        super(IndexedRaggedTs, self).__init__(filename, n_loc=n_loc,
                                              n_obs=None,
                                              obs_loc_lut=obs_loc_lut,
                                              **kwargs)
        self.not_timeseries.append(self.obs_loc_lut)
        self.constant_dates = False

    def _init_dimensions_and_lookup(self):
        """
        Initializes the dimensions and variables for the lookup
        between locations and entries in the time series.
        """
        if self.n_loc is None:
            raise ValueError('Number of locations '
                             'has to be set for a new file')

        self.create_dim(self.loc_dim_name, self.n_loc)
        self.create_dim(self.obs_dim_name, self.n_obs)
        attr = {'long_name': 'which location this observation is for',
                'instance_dimension': self.loc_dim_name}
        self.write_var(self.obs_loc_lut, data=None, dim=self.obs_dim_name,
                       dtype=np.int, attr=attr,
                       chunksizes=self.unlim_chunksize)

    def _get_index_of_ts(self, loc_id):
        """
        Get the indices into the observation dimension for the given
        location id.

        Parameters
        ----------
        loc_id : int
            Location id.

        Raises
        ------
        IOError
            If the location id could not be found.
        """
        try:
            loc_ix = self._get_loc_index(loc_id)
        except IOError:
            msg = "".join(("Time series for Location #",
                           loc_id.__str__(), " not found."))
            raise IOError(msg)

        index = np.where(self.variables[self.obs_loc_lut] == loc_ix)[0]
        if len(index) == 0:
            msg = "".join(("Time series for Location #",
                           loc_id.__str__(), " not found."))
            raise IOError(msg)

        return index

    def _get_loc_ix_from_obs_ix(self, obs_ix):
        """
        Get location index from observation index.

        Parameters
        ----------
        obs_ix : int
            Observation index.

        Returns
        -------
        loc_ix : int
            Location index.
        """
        return self.variables[self.obs_loc_lut][obs_ix]
    def write_ts(self, loc_id, data, dates, loc_descr=None, lon=None,
                 lat=None, alt=None, fill_values=None, attributes=None,
                 dates_direct=False):
        """
        Write time series data; if the location does not yet exist it is
        also added to the file.

        Parameters
        ----------
        loc_id : int
            Location id.
        data : dict
            Dictionary with variable names as keys and numpy.arrays as
            values.
        dates : numpy.array
            Array of datetime objects.
        attributes : dict, optional
            Dictionary of attributes that should be added to the netCDF
            variables. Can also be a dict of dicts for each variable name
            as in the data dict.
        dates_direct : boolean
            If True the dates are already converted into floating point
            numbers of correct magnitude.
        """
        try:
            idx = self._get_loc_index(loc_id)
        except IOError:
            idx = self._add_location(loc_id, lon, lat, alt, loc_descr)

        # find out if attributes is a dict to be used for all variables or
        # if there is a dictionary of attributes for each variable
        unique_attr = False
        if attributes is not None:
            if sorted(data.keys()) == sorted(attributes.keys()):
                unique_attr = True

        for key in data:
            if data[key].size != dates.size:
                raise DatasetError("timestamps and dataset %s "
                                   "must have the same size" % key)

        # append the location index of each new observation to the
        # lookup variable
        indices = np.empty_like(dates, dtype=np.int)
        indices.fill(idx)
        self.append_var(self.obs_loc_lut, indices)
        index = self._get_index_of_ts(loc_id)

        # if this is an append, only the last dates.size elements
        # are needed
        if index.size > dates.size:
            index = index[index.size - dates.size:]

        for key in data:
            internal_attributes = {'name': key,
                                   'coordinates': 'time lat lon alt'}

            if type(fill_values) == dict:
                internal_attributes['_FillValue'] = fill_values[key]

            if attributes is not None:
                if unique_attr:
                    variable_attributes = attributes[key]
                else:
                    variable_attributes = attributes

                internal_attributes.update(variable_attributes)

            # does nothing if the variable already exists
            self.write_var(key, data=None, dim=self.obs_dim_name,
                           attr=internal_attributes,
                           dtype=data[key].dtype,
                           chunksizes=self.unlim_chunksize)
            self.dataset.variables[key][index] = data[key]

        if dates_direct:
            self.dataset.variables[self.time_var][index] = dates
        else:
            units = self.dataset.variables[self.time_var].units
            self.dataset.variables[self.time_var][index] = \
                netCDF4.date2num(dates, units=units, calendar='standard')

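# Sketch for IndexedRaggedTs (illustrative only; names are hypothetical).
# Because the observation dimension is unlimited, time series of varying
# length can be appended per location:
#
#   dates = np.array([datetime.datetime(2007, 1, d) for d in range(1, 3)])
#   with IndexedRaggedTs('/tmp/indexed.nc', n_loc=2, mode='w') as nc:
#       nc.write_ts(7, {'sm': np.array([0.4, 0.5])}, dates,
#                   lon=11.3, lat=47.2)
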
class NetCDFGriddedTS(dsbase.DatasetTSBase):
    """
    Base class for reading time series data on a cell grid written with
    one of the netCDF writers in this module.

    Parameters
    ----------
    grid : grid object
        Grid that implements find_nearest_gpi() and gpi2cell().
    read_bulk : boolean, optional
        If True, read_bulk will be activated.
    data_path : string, optional
        Path to the data directory.
    parameters : list, optional
        If given, only parameters from this list will be read.
    ioclass : class
        Class to use to read the data.
    offsets : dict, optional
        Offset to apply to a variable, in addition to the offset
        specified in the netCDF file.
    scale_factors : dict, optional
        Scale factors to apply to a variable.
    """

    def __init__(self, grid=None, read_bulk=False, data_path=None,
                 parameters=None, ioclass=None,
                 cell_filename_template='%04d.nc',
                 offsets=None, scale_factors=None):
        self.parameters = parameters
        self.ioclass = ioclass
        self.netcdf_obj = None
        self.cell_file_templ = cell_filename_template
        self.read_bulk = read_bulk
        self.previous_cell = None
        self.offsets = offsets
        self.scale_factors = scale_factors
        if self.ioclass == OrthoMultiTs:
            self.read_dates = False
        else:
            self.read_dates = True
        self.dates = None
        super(NetCDFGriddedTS, self).__init__(data_path, grid)
    def read_gp(self, gpi, period=None):
        """
        Reads data for a given grid point index.

        Parameters
        ----------
        gpi : int
            Grid point index on the dgg grid.
        period : list
            2 element array containing datetimes [start, end].

        Returns
        -------
        ts : pandas.DataFrame
            Time series.
        """
        cell = self.grid.gpi2cell(gpi)
        filename = os.path.join(self.path, self.cell_file_templ % cell)

        if self.read_bulk:
            if self.previous_cell != cell:
                # print "Switching cell to %04d reading gpi %d" % (cell, gpi)
                if self.netcdf_obj is not None:
                    self.netcdf_obj.close()
                    self.netcdf_obj = None
                self.netcdf_obj = self.ioclass(filename,
                                               read_bulk=self.read_bulk,
                                               read_dates=self.read_dates)
                self.previous_cell = cell
        else:
            if self.netcdf_obj is not None:
                self.netcdf_obj.close()
                self.netcdf_obj = None
            self.netcdf_obj = self.ioclass(filename,
                                           read_bulk=self.read_bulk,
                                           read_dates=self.read_dates)

        if self.parameters is None:
            data = self.netcdf_obj.read_all_ts(gpi)
        else:
            data = self.netcdf_obj.read_ts(self.parameters, gpi)

        if self.dates is None or self.read_dates:
            self.dates = self.netcdf_obj.read_dates(gpi)
        time = self.dates

        # remove the time column from the dataframe; only the index
        # should contain the time
        try:
            data.pop('time')
        except KeyError:
            # if the time value is not found then do nothing
            pass

        ts = pd.DataFrame(data, index=time)

        if period is not None:
            ts = ts[period[0]:period[1]]

        if self.scale_factors is not None:
            for scale_column in self.scale_factors:
                if scale_column in ts.columns:
                    ts[scale_column] *= self.scale_factors[scale_column]

        if self.offsets is not None:
            for offset_column in self.offsets:
                if offset_column in ts.columns:
                    ts[offset_column] += self.offsets[offset_column]

        if not self.read_bulk:
            self.netcdf_obj.close()
            self.netcdf_obj = None
        return ts

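# Usage sketch for NetCDFGriddedTS (illustrative only; assumes a cell grid
# object 'grid' that implements find_nearest_gpi() and gpi2cell(), and a
# directory of per-cell files such as '0042.nc' written by OrthoMultiTs):
#
#   reader = NetCDFGriddedTS(grid=grid, ioclass=OrthoMultiTs,
#                            data_path='/data/cells', read_bulk=True)
#   ts = reader.read_gp(123456)   # pandas.DataFrame indexed by time
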
class netCDF2DImageStack(Dataset):
    """
    Class for writing stacks of 2D images into netCDF.
    """

    def __init__(self, filename, grid=None, times=None, mode='r', name=''):
        self.grid = grid
        if len(grid.shape) != 2:
            raise ValueError("grid must be a 2D grid for an image stack")
        self.filename = filename
        self.times = times
        self.variables = []
        self.time_var = 'time'
        self.time_units = "days since 1900-01-01"
        self.time_chunksize = 1
        self.lon_chunksize = 1
        self.lat_chunksize = len(self.grid.latdim)

        super(netCDF2DImageStack, self).__init__(filename, name=name,
                                                 mode=mode)

        if self.mode == 'w':
            self._init_dimensions()
            self._init_time()
            self._init_location_variables()
        elif self.mode in ['a', 'r']:
            self._load_grid()
            self._load_variables()

    def _init_dimensions(self):
        self.create_dim('lon', len(self.grid.londim))
        self.create_dim('lat', len(self.grid.latdim))
        self.create_dim('time', len(self.times))

    def _load_grid(self):
        lons = self.dataset.variables['lon'][:]
        lats = self.dataset.variables['lat'][:]
        self.grid = grids.gridfromdims(lons, lats)

    def _load_variables(self):
        for var in self.dataset.variables:
            if self.dataset.variables[var].dimensions == ('time', 'lat',
                                                          'lon'):
                self.variables.append(var)

    def _init_time(self):
        """
        Initialize the time variable that is the basis of the format.
        """
        time_data = netCDF4.date2num(self.times, self.time_units)
        self.write_var(self.time_var, data=time_data, dim='time',
                       attr={'standard_name': 'time',
                             'long_name': 'time of measurement',
                             'units': self.time_units},
                       dtype=np.double,
                       chunksizes=[self.time_chunksize])

    def _init_location_variables(self):
        # write station information: longitude and latitude
        self.write_var('lon', data=self.grid.londim, dim='lon',
                       attr={'standard_name': 'longitude',
                             'long_name': 'location longitude',
                             'units': 'degrees_east',
                             'valid_range': (-180.0, 180.0)},
                       dtype=np.float)
        self.write_var('lat', data=self.grid.latdim, dim='lat',
                       attr={'standard_name': 'latitude',
                             'long_name': 'location latitude',
                             'units': 'degrees_north',
                             'valid_range': (-90.0, 90.0)},
                       dtype=np.float)
    def init_variable(self, var):
        self.write_var(var, data=None, dim=('time', 'lat', 'lon'),
                       dtype=np.float, attr={'_FillValue': -9999.})
    def write_ts(self, gpi, data):
        """
        Write a time series into the image stack at the given gpi.

        Parameters
        ----------
        gpi : int or numpy.array
            Grid point indices to write to.
        data : dict
            Dictionary of int or numpy.array for each variable that
            should be written; shape must be (len(gpi), len(times)).
        """
        gpi = np.atleast_1d(gpi)
        for i, gp in enumerate(gpi):
            row, column = self.grid.gpi2rowcol(gp)
            for var in data:
                if var not in self.variables:
                    self.variables.append(var)
                    self.init_variable(var)
                self.dataset.variables[var][
                    :, row, column] = np.atleast_2d(data[var])[i, :]
    def __setitem__(self, gpi, data):
        """
        Write a time series into the image stack at the given gpi.
        Equivalent to write_ts.

        Parameters
        ----------
        gpi : int or numpy.array
            Grid point indices to write to.
        data : dict
            Dictionary of int or numpy.array for each variable that
            should be written; shape must be (len(gpi), len(times)).
        """
        self.write_ts(gpi, data)

    def __getitem__(self, key):
        gpi = np.atleast_1d(key)
        data = {}
        for i, gp in enumerate(gpi):
            row, column = self.grid.gpi2rowcol(gp)
            for var in self.variables:
                data[var] = self.dataset.variables[var][:, row, column]
        return pd.DataFrame(data, index=self.times)

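# Usage sketch for netCDF2DImageStack (illustrative only; assumes a 2D
# grid 'grid2d' built from regular lon/lat dimensions, e.g. via
# grids.gridfromdims):
#
#   times = [datetime.datetime(2007, 1, d) for d in range(1, 4)]
#   stack = netCDF2DImageStack('/tmp/stack.nc', grid=grid2d, times=times,
#                              mode='w')
#   stack[0] = {'sm': np.array([0.1, 0.2, 0.3])}  # same as write_ts(0, ...)
#   stack.close()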