```python
project_root = get_config().config_file.parent
f_path = project_root / 'test_data/WienerHammerstein'
hdf_files = get_files(f_path, extensions='.hdf5', recurse=True)
seq = DataBlock(blocks=(SequenceBlock.from_hdf(['u','y'], TensorSequencesInput, padding=True, cached=None),
                        SequenceBlock.from_hdf(['y'], TensorSequencesOutput, cached=None)),
                get_items=CreateDict([DfHDFCreateWindows(win_sz=100+1, stp_sz=100, clm='u')]),
                splitter=ApplyToDict(ParentSplitter()))
dls = seq.dataloaders(hdf_files)
```
Core functions
Creating a DataLoader manually for each dataset is cumbersome. The full flexibility of the DataBlock API is not needed once we make a few assumptions:

1. The individual .hdf5 files are split by a “train”, “valid” and “test” parent directory (see the example layout below).
2. All input and output signals are one-dimensional and have the same length.
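An example layout that satisfies these assumptions could look as follows (the file names are arbitrary and only illustrative):

```
WienerHammerstein/
├── train/
│   ├── measurement_01.hdf5
│   └── measurement_02.hdf5
├── valid/
│   └── measurement_03.hdf5
└── test/
    └── measurement_04.hdf5
```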
To implement this function, we need to store the normalization parameters of a dataset in a file, so that a trained model can still work with the dataset when it is loaded again.
We assume that the dataloader always provides normalized data. When the model is deployed, it has to normalize the data itself.
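As a consequence, the stored mean and standard deviation have to be applied at deployment time. A minimal sketch of that step, assuming the values were saved from the training dataloader beforehand (the numbers below are only placeholders):

```python
import numpy as np

# placeholder statistics; in practice they come from extract_mean_std_from_dls
# (or dict_file_load) and belong to the training data, not the deployment data
u_mean, u_std = np.float32(-0.0045), np.float32(0.6732)

def normalize_input(u_raw):
    "Scale a raw input signal the same way the training dataloader does."
    return (u_raw - u_mean) / u_std
```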
extract_mean_std_from_dls
extract_mean_std_from_dls (dls)
extract_mean_std_from_dls(dls)
(TensorSequencesInput([[[-0.0045, -0.0413]]]),
TensorSequencesInput([[[0.6732, 0.2405]]]))
```python
test_eq(type(extract_mean_std_from_dls(dls)), tuple)
test_eq(len(extract_mean_std_from_dls(dls)), 2)
```
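The idea behind this helper can be sketched as follows: draw a few batches from the training dataloader and reduce the inputs over the batch and time dimensions, keeping one value per channel (a simplified sketch, not the exported implementation):

```python
import torch

def mean_std_from_dls_sketch(dls, n_batches=10):
    "Estimate per-channel mean/std of the model inputs from a few training batches."
    xs = []
    for i, batch in enumerate(dls.train):
        xs.append(batch[0])   # batch[0] is the input tensor of shape (bs, seq_len, channels)
        if i + 1 >= n_batches:
            break
    x = torch.cat(xs)         # concatenate along the batch dimension
    return x.mean(dim=(0, 1), keepdim=True), x.std(dim=(0, 1), keepdim=True)
```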
dict_file_save
dict_file_save (key, value, f_path='dls_normalize.p')
save value to a dictionary file, appends if it already exists
```python
val = [1, 2, 5]
dict_file_save('tst_key', val)
```
dict_file_load
dict_file_load (key, f_path='dls_normalize.p')
load value from a dictionary file
test_eq(dict_file_load('tst_key'), val)
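The behaviour of these two helpers can be pictured as a small pickle-backed dictionary on disk, keyed by name; the sketch below assumes a plain pickle file and is not necessarily the exported implementation:

```python
import pickle
from pathlib import Path

def dict_file_save_sketch(key, value, f_path='dls_normalize.p'):
    "Store `value` under `key`, merging with any dictionary already present in the file."
    path = Path(f_path)
    store = pickle.loads(path.read_bytes()) if path.exists() else {}
    store[key] = value
    path.write_bytes(pickle.dumps(store))

def dict_file_load_sketch(key, f_path='dls_normalize.p'):
    "Return the value stored under `key` in the dictionary file."
    return pickle.loads(Path(f_path).read_bytes())[key]
```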
extract_mean_std_from_hdffiles
extract_mean_std_from_hdffiles (lst_files, lst_signals)
Calculate the mean and standard deviation of the signals from the provided HDF5 files.
| | Details |
|---|---|
| lst_files | List of paths to HDF5 files |
| lst_signals | List of signal names, each a dataset within the HDF5 files |
extract_mean_std_from_hdffiles(hdf_files, ['u','y'])
(array([-0.00387449, -0.03890106], dtype=float32),
array([0.65983725, 0.23847243], dtype=float32))
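Conceptually this amounts to pooling each signal over all files and computing its statistics. A minimal sketch with h5py and numpy, assuming each signal is a 1d dataset inside the file and that everything fits into memory (the library itself may work differently):

```python
import h5py
import numpy as np

def mean_std_from_hdf_sketch(lst_files, lst_signals):
    "Pool each signal over all files and return per-signal mean and std arrays."
    means, stds = [], []
    for sig in lst_signals:
        parts = []
        for f in lst_files:
            with h5py.File(f, 'r') as h:
                parts.append(h[sig][:])   # read the 1d dataset for this signal
        samples = np.concatenate(parts)
        means.append(samples.mean())
        stds.append(samples.std())
    return np.array(means, dtype=np.float32), np.array(stds, dtype=np.float32)
```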
extract_mean_std_from_dataset
extract_mean_std_from_dataset (lst_files, u, x, y)
extract_mean_std_from_dataset(hdf_files, ['u'], [], ['y'])
((array([-0.00387546], dtype=float32), array([0.6569116], dtype=float32)),
(None, None),
(array([-0.0389192], dtype=float32), array([0.23567446], dtype=float32)))
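As the output above suggests, the grouped variant can be read as applying the per-signal extraction to the u, x and y lists separately, with an empty group mapping to `(None, None)`; a sketch of that reading:

```python
def mean_std_from_dataset_sketch(lst_files, u, x, y):
    "Return a (mean, std) pair per signal group; empty groups yield (None, None)."
    def _group(signals):
        # reuses extract_mean_std_from_hdffiles documented above
        return extract_mean_std_from_hdffiles(lst_files, signals) if signals else (None, None)
    return _group(u), _group(x), _group(y)
```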
is_dataset_directory
is_dataset_directory (ds_path)
*Checks if the given directory path is a dataset with hdf5 files.*

*:param ds_path: The path to the directory to check.*
*:return: True if the directory contains ‘train’, ‘valid’ and ‘test’ subdirectories, each of which contains at least one HDF5 file; False otherwise.*
```python
test_eq(is_dataset_directory(f_path), True)
test_eq(is_dataset_directory(f_path.parent), False)
```
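Under the assumptions listed at the top of this page, the check boils down to looking for the three split folders and at least one HDF5 file in each of them. A sketch of that logic:

```python
from pathlib import Path

def is_dataset_directory_sketch(ds_path):
    "True if ds_path contains train/valid/test folders that each hold at least one .hdf5 file."
    ds_path = Path(ds_path)
    return all(
        (ds_path / split).is_dir() and any((ds_path / split).rglob('*.hdf5'))
        for split in ('train', 'valid', 'test')
    )
```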
create_dls
create_dls (u, y, dataset:pathlib.Path|list, win_sz:int=100, x:list=[], stp_sz:int=1, sub_seq_len:int=None, bs:int=64, prediction:bool=False, input_delay:bool=False, valid_stp_sz:int=None, cached:bool=True, num_workers:int=5, max_batches_training:int=300, max_batches_valid:int=None, dls_id:str=None)
| | Type | Default | Details |
|---|---|---|---|
| u | | | list of input signal names |
| y | | | list of output signal names |
| dataset | pathlib.Path \| list | | path to dataset with train, valid and test folders, or list of file paths |
| win_sz | int | 100 | initial window size |
| x | list | [] | optional list of state signal names |
| stp_sz | int | 1 | step size between consecutive windows |
| sub_seq_len | int | None | if provided, uses truncated backpropagation through time with this sub-sequence length |
| bs | int | 64 | batch size |
| prediction | bool | False | if true, the output is concatenated to the input, mainly for prediction tasks |
| input_delay | bool | False | if true, the input is delayed by one step |
| valid_stp_sz | int | None | step size between consecutive validation windows, defaults to win_sz |
| cached | bool | True | if true, the data is cached in RAM |
| num_workers | int | 5 | number of processes for the dataloader, 0 for no multiprocessing |
| max_batches_training | int | 300 | limits the number of training batches in a single epoch |
| max_batches_valid | int | None | limits the number of validation batches in a single epoch |
| dls_id | str | None | identifier for the dataloader to cache the normalization values, does not cache when not provided |
```python
dls = create_dls(u=['u'], y=['y'], dataset=f_path, dls_id='wh')
dls.show_batch(max_n=1)
plt.title('Training Dataloader Simulation')
dls[2].show_batch(max_n=1)
plt.title('Test Dataloader Simulation')
```
Text(0.5, 1.0, 'Test Dataloader Simulation')
```python
dls = create_dls(u=['u'], y=['y'], dataset=f_path, dls_id='wh', prediction=True)
dls.show_batch(max_n=1)
plt.title('Training Dataloader Prediction')
dls[2].show_batch(max_n=1)
plt.title('Test Dataloader Prediction')
```
Text(0.5, 1.0, 'Test Dataloader Prediction')
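Beyond the defaults shown above, the windowing and truncation parameters can be combined; the call below is only illustrative (the values are arbitrary, not recommendations):

```python
dls_tbptt = create_dls(
    u=['u'], y=['y'], dataset=f_path,
    win_sz=500,         # longer windows cut from each measurement
    stp_sz=250,         # consecutive training windows overlap by half a window
    sub_seq_len=100,    # truncated backpropagation through time over 100-step chunks
    bs=32,
    dls_id='wh_tbptt',  # caches the normalization values under this identifier
)
```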
# def loader_factory(**defaults):
# @delegates(create_dataloader)
# def loader(**kwargs):
# combined_args = {**defaults, **kwargs}
# return create_dataloader(**combined_args)
# return loader
# create_dls_test = loader_factory(
# u=['u'],y=['y'],
# dataset=f_path,
# win_sz=100,
# stp_sz=100
# )
dls = create_dls_test()
dls.show_batch()
import nbdev; nbdev.nbdev_export()