Source code for matdata.preprocess

# -*- coding: utf-8 -*-
"""
**Multiple Aspect Trajectory Tools Framework**

*MAT-data: Data Preprocessing for Multiple Aspect Trajectory Data Mining*

The present application offers a tool to support the user in the classification task of multiple
aspect trajectories, specifically for extracting and visualizing the movelets, the parts of the
trajectory that better discriminate a class. It integrates the fragmented approaches available for
multiple aspect trajectories, and in general for multidimensional sequence classification, into a
unique web-based and Python library system. It offers both movelets visualization and
classification methods.

Created on Dec, 2023
Copyright (C) 2023, License GPL Version 3 or later (see LICENSE file)

@author: Tarlis Portela

----

"""
import os
import pandas as pd
import numpy as np

from pathlib import Path
from tqdm.auto import tqdm
import glob2 as glob
import random

from sklearn.model_selection import KFold, train_test_split

from matdata.converter import *
#from .inc.script_def import getDescName

DS_FUNCTIONS = {
    '.csv': csv2df,
    '.parquet': parquet2df,
    '.zip': zip2df,
    '.mat': mat2df, #TODO
    '.ts': ts2df,
    '.xes': xes2df,
}

#-------------------------------------------------------------------------->>
def readDataset(data_path, folder=None, file='train.csv', class_col='label', tid_col='tid', missing='?'):
    """
    Reads a dataset file (CSV format by default, 'train.csv') and returns it as a pandas DataFrame.

    Parameters:
    -----------
    data_path : str
        The directory path where the dataset file is located.
    folder : str, optional
        The subfolder within the data path where the dataset file is located.
    file : str, optional (default='train.csv')
        The name of the dataset file to be read.
    class_col : str, optional (default='label')
        The name of the column to be treated as the class/label column.
    tid_col : str, optional (default='tid')
        The name of the column to be used as the trajectory identifier.
    missing : str, optional (default='?')
        The placeholder for missing values in the dataset.

    Returns:
    --------
    pandas.DataFrame
        A DataFrame containing the dataset from the specified file, with trajectory identifier,
        class label, and missing values handled as specified.
    """
    if folder:
        url = os.path.join(data_path, folder)
    else:
        url = data_path

    ext = Path(url).suffix
    if '' == ext:
        url = os.path.join(url, file)
        ext = Path(url).suffix
    if '' == ext:
        url = url + '.csv'

    url = os.path.abspath(url)
    ext = Path(url).suffix

    df = DS_FUNCTIONS[ext](url, class_col=class_col, tid_col=tid_col, missing=missing)

    return df
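# Usage sketch (illustrative, not part of the module; the directory layout below
# is hypothetical). The file extension selects the reader through DS_FUNCTIONS,
# and a bare directory defaults to 'train.csv':
#
#   df = readDataset('./data/FoursquareNYC')                 # reads ./data/FoursquareNYC/train.csv
#   df = readDataset('./data', folder='FoursquareNYC', file='test.parquet')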
def organizeFrame(df, columns_order=None, tid_col='tid', class_col='label', make_spatials=False):
    """
    Organizes a DataFrame by reordering columns and optionally converting spatial columns.

    Parameters:
    -----------
    df : pandas.DataFrame
        The DataFrame to be organized.
    columns_order : list of str, optional
        A list of column names specifying the desired order of columns. If None, no reordering is performed.
    tid_col : str, optional (default='tid')
        The name of the column to be used as the trajectory identifier.
    class_col : str, optional (default='label')
        The name of the column to be treated as the class/label column.
    make_spatials : bool, optional (default=False)
        A flag indicating whether to provide the spatial columns in both formats: 'lat' and 'lon'
        as separate columns, and 'space', which concatenates lat/lon into one column.

    Returns:
    --------
    pandas.DataFrame
        A DataFrame containing the organized data, with columns added as specified and spatial
        columns converted if requested.
    columns_order_zip : list
        A list of the columns including the 'space' column, if present.
    columns_order_csv : list
        A list of the columns including the 'lat'/'lon' columns, if present.
    """
    if make_spatials and (set(df.columns) & set(['lat', 'lon'])) and 'space' not in df.columns:
        df.loc[:, 'space'] = df["lat"].astype(str) + ' ' + df["lon"].astype(str)

        if columns_order is not None:
            columns_order.insert(columns_order.index('lon')-1, 'space')

    elif make_spatials and ('space' in df.columns or 'lat_lon' in df.columns) and not (set(df.columns) & set(['lat', 'lon'])):
        if 'lat_lon' in df.columns:
            df.rename(columns={'lat_lon': 'space'}, inplace=True)
            if columns_order is not None:
                columns_order[columns_order.index('lat_lon')] = 'space'

        ll = df['space'].str.split(" ", n=1, expand=True)
        df["lat"] = ll[0].astype(float)
        df["lon"] = ll[1].astype(float)

        if columns_order is not None:
            columns_order.insert(columns_order.index('space'), 'lat')
            columns_order.insert(columns_order.index('space')+1, 'lon')

    # For columns ordering:
    if columns_order is None:
        columns_order = df.columns

    if class_col and class_col in df.columns:
        columns_order = [x for x in columns_order if x not in [tid_col, class_col]]
        columns_order = columns_order + [tid_col, class_col]
    else:
        columns_order = [x for x in columns_order if x not in [tid_col]]
        columns_order = columns_order + [tid_col]

    columns_order_zip = [x for x in columns_order if x not in ['lat', 'lon']]
    columns_order_csv = [x for x in columns_order if x not in ['space']]

    return df, columns_order_zip, columns_order_csv
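# Usage sketch (illustrative): given a frame with 'lat'/'lon' columns,
# make_spatials=True also builds the concatenated 'space' column, and the two
# returned orderings pick the right spatial representation per output format:
#
#   df, cols_zip, cols_csv = organizeFrame(df, make_spatials=True)
#   # cols_zip -> [..., 'space', ..., 'tid', 'label']      (no 'lat'/'lon')
#   # cols_csv -> [..., 'lat', 'lon', ..., 'tid', 'label'] (no 'space')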
#-------------------------------------------------------------------------->>
def trainTestSplit(df, train_size=0.7, random_num=1, tid_col='tid', class_col='label', fileprefix='',
                   data_path='.', outformats=[], verbose=False, organize_columns=True):
    """
    Splits a DataFrame into training and testing sets, optionally organizes columns, and saves them to files.

    Parameters:
    -----------
    df : pandas.DataFrame
        The DataFrame to be split into training and testing sets.
    train_size : float, optional (default=0.7)
        The proportion of the dataset to include in the training set.
    random_num : int, optional (default=1)
        The random seed for reproducible results.
    tid_col : str, optional (default='tid')
        The name of the column to be used as the trajectory identifier.
    class_col : str, optional (default='label')
        The name of the column to be treated as the class/label column.
    fileprefix : str, optional (default='')
        The prefix to be added to the file names when saving.
    data_path : str, optional (default='.')
        The directory path where the output files will be saved.
    outformats : list of str, optional
        A list of output formats for saving the datasets (e.g., ['csv', 'parquet']).
    verbose : bool, optional (default=False)
        A flag indicating whether to display progress messages.
    organize_columns : bool, optional (default=True)
        A flag indicating whether to organize columns before saving.

    Returns:
    --------
    train : pandas.DataFrame
        A DataFrame containing the training set.
    test : pandas.DataFrame
        A DataFrame containing the testing set.
    """
    #outformats=['zip', 'csv', 'mat']

    if verbose:
        # train_size is a proportion, so convert it to a percentage for display:
        print(str(int(train_size*100))+"% train and test split ... ")

    if organize_columns:
        df, columns_order_zip, columns_order_csv = organizeFrame(df, None, tid_col, class_col)
    else:
        columns_order_zip = list(df.columns)
        columns_order_csv = list(df.columns)

    train_index, test_index, _ = splitTIDs(df, train_size, random_num, tid_col, class_col, min_elements=1)

    train = df.loc[df[tid_col].isin(train_index)]
    test = df.loc[df[tid_col].isin(test_index)]

    # WRITE Train / Test Files
    for outType in outformats:
        writeFiles(data_path, fileprefix, train, test, tid_col, class_col,
                   columns_order_zip if outType in ['zip', 'mat'] else columns_order_csv,
                   outformat=outType)

    if verbose:
        print("Done.")
        print(" --------------------------------------------------------------------------------")

    return train, test
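# Usage sketch (illustrative; the output directory name is hypothetical):
# a 70/30 hold-out split, also saved as CSV files:
#
#   train, test = trainTestSplit(df, train_size=0.7, data_path='./out',
#                                outformats=['csv'], verbose=True)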
def kfold_trainTestSplit(df, k, random_num=1, tid_col='tid', class_col='label', fileprefix='',
                         columns_order=None, ktrain=None, ktest=None, mat_columns=None,
                         data_path='.', outformats=[], verbose=False):
    """
    Splits a DataFrame into k folds for k-fold cross-validation, optionally organizes columns,
    and saves them to files.

    Parameters:
    -----------
    df : pandas.DataFrame
        The DataFrame to be split into k folds.
    k : int
        The number of folds for cross-validation.
    random_num : int, optional (default=1)
        The random seed for reproducible results.
    tid_col : str, optional (default='tid')
        The name of the column to be used as the trajectory identifier.
    class_col : str, optional (default='label')
        The name of the column to be treated as the class/label column.
    fileprefix : str, optional (default='')
        The prefix to be added to the file names when saving, for example: 'specific_' or 'generic_'.
    columns_order : list of str, optional
        A list of column names specifying the desired order of columns. If None, no reordering is performed.
    ktrain : list of pandas.DataFrame, optional
        A list of training sets for each fold. If None, the function will split the data into
        training and testing sets.
    ktest : list of pandas.DataFrame, optional
        A list of testing sets for each fold. If None, the function will split the data into
        training and testing sets.
    mat_columns : list of str, optional
        A list of column names to be included in the .mat files, corresponding to `columns_order`.
    data_path : str, optional (default='.')
        The directory path where the output files will be saved.
    outformats : list of str, optional
        A list of output formats for saving the datasets (e.g., ['csv', 'zip', 'parquet']).
    verbose : bool, optional (default=False)
        A flag indicating whether to display progress messages.

    Returns:
    --------
    ktrain : list of pandas.DataFrame
        A list of DataFrames containing the training sets.
    ktest : list of pandas.DataFrame
        A list of DataFrames containing the testing sets.
    """
    if verbose:
        print(str(k)+"-fold train and test split ... ")

    df, columns_order_zip, columns_order_csv = organizeFrame(df, columns_order, tid_col, class_col)

    if not ktrain:
        ktrain, ktest = splitData(df, k, random_num, tid_col, class_col)
    elif verbose:
        print("Train and test data provided.")

    if len(outformats) > 0:
        for x in range(k):
            train_aux = ktrain[x]
            test_aux = ktest[x]

            for outType in outformats:
                if verbose:
                    print("Writing", outType, "files ... " + str(x+1) + '/' + str(k))

                path = 'run' + str(x+1)
                if not os.path.exists(os.path.join(data_path, path)):
                    os.makedirs(os.path.join(data_path, path))

                writeFiles(data_path, os.path.join(path, fileprefix), train_aux, test_aux,
                           tid_col, class_col,
                           columns_order_zip if outType in ['zip', 'mat'] else columns_order_csv,
                           mat_columns, None, outType, opSuff=str(x+1))

    if verbose:
        print("Done.")
        print(" --------------------------------------------------------------------------------")

    return ktrain, ktest
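# Usage sketch (illustrative): a 5-fold split written to ./out/run1 .. ./out/run5:
#
#   ktrain, ktest = kfold_trainTestSplit(df, k=5, data_path='./out', outformats=['csv'])
#   for fold, (tr, te) in enumerate(zip(ktrain, ktest), start=1):
#       print('fold', fold, len(tr), len(te))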
def stratify(df, sample_size=0.5, train_size=0.7, random_num=1, tid_col='tid', class_col='label',
             organize_columns=True, mat_columns=None, fileprefix='', outformats=[], data_path='.'):
    """
    Stratifies a DataFrame by class label and splits it into training and testing sets,
    optionally organizes columns, and saves them to files.

    Parameters:
    -----------
    df : pandas.DataFrame
        The DataFrame to be stratified and split into training and testing sets.
    sample_size : float, optional (default=0.5)
        The proportion of the dataset to sample for stratification.
    train_size : float, optional (default=0.7)
        The proportion of the stratified dataset to include in the training set.
    random_num : int, optional (default=1)
        The random seed for reproducible results.
    tid_col : str, optional (default='tid')
        The name of the column to be used as the trajectory identifier.
    class_col : str, optional (default='label')
        The name of the column to be treated as the class/label column.
    organize_columns : bool, optional (default=True)
        A flag indicating whether to organize columns before saving.
    mat_columns : list of str, optional (unused for now)
        A list of column names to be included in the .mat files, if set to save.
    fileprefix : str, optional (default='')
        The prefix to be added to the file names when saving.
    outformats : list of str, optional
        A list of output formats for saving the datasets (e.g., ['csv', 'zip', 'parquet']).
    data_path : str, optional (default='.')
        The directory path where the output files will be saved.

    Returns:
    --------
    train : pandas.DataFrame
        A DataFrame containing the training set.
    test : pandas.DataFrame
        A DataFrame containing the testing set.
    """
    train_index, _, _ = splitTIDs(df, sample_size, random_num, tid_col, class_col, min_elements=2)
    df = df.loc[df[tid_col].isin(train_index)].copy()

    train_index, test_index, _ = splitTIDs(df, train_size, random_num, tid_col, class_col, min_elements=1)

    if organize_columns:
        df, columns_order_zip, columns_order_csv = organizeFrame(df, None, tid_col, class_col)
    else:
        columns_order_zip = list(df.columns)
        columns_order_csv = list(df.columns)

    train = df.loc[df[tid_col].isin(train_index)]
    test = df.loc[df[tid_col].isin(test_index)]

    for outType in outformats:
        path = 'S' + str(int(sample_size*100))
        if not os.path.exists(os.path.join(data_path, path)):
            os.makedirs(os.path.join(data_path, path))

        writeFiles(data_path, os.path.join(path, fileprefix), train, test, tid_col, class_col,
                   columns_order_zip if outType in ['zip', 'mat'] else columns_order_csv,
                   mat_columns, None, outType, opSuff=path)

    return train, test
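# Usage sketch (illustrative): keep a stratified 50% sample, then split it 70/30;
# when output formats are given, files go to a subfolder named after the sample
# size (here ./out/S50):
#
#   train, test = stratify(df, sample_size=0.5, train_size=0.7,
#                          data_path='./out', outformats=['csv'])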
# TODO fix stratify:
def kfold_stratify(df, k=10, inc=1, limit=10, random_num=1, tid_col='tid', class_col='label',
                   fileprefix='', ktrain=None, ktest=None, organize_columns=True, mat_columns=None,
                   data_path='.', outformats=[], ignore_ltk=True):
    print(str(k)+"-fold stratification of train and test ... ")

    if organize_columns:
        df, columns_order_zip, columns_order_csv = organizeFrame(df, None, tid_col, class_col)
    else:
        columns_order_zip = list(df.columns)
        columns_order_csv = list(df.columns)

    if not ktrain:
        ktrain, ktest = splitData(df, k, random_num, tid_col, class_col, ignore_ltk=ignore_ltk)
    else:
        print("Train and test data provided.")

    for x in range(0, limit, inc):
        train_aux = ktrain[0]
        test_aux = ktest[0]

        for y in range(1, x+1):
            train_aux = pd.concat([train_aux, ktrain[y]])
            test_aux = pd.concat([test_aux, ktest[y]])

        for outType in outformats:
            path = 'S' + str((x+1)*int(100/k))
            if not os.path.exists(os.path.join(data_path, path)):
                os.makedirs(os.path.join(data_path, path))

            writeFiles(data_path, os.path.join(path, fileprefix), train_aux, test_aux, tid_col, class_col,
                       columns_order_zip if outType in ['zip', 'mat'] else columns_order_csv,
                       mat_columns, None, outType, opSuff=str(x+1))

    print(" Done.")
    print(" --------------------------------------------------------------------------------")
    return ktrain, ktest
def klabels_stratify(df, kl=10, train_size=0.7, random_num=1, tid_col='tid', class_col='label',
                     organize_columns=True, mat_columns=None, fileprefix='', outformats=[], data_path='.'):
    """
    Stratifies a DataFrame to a specified number of class labels and splits it into training and
    testing sets, optionally organizes columns, and saves them to files.

    Parameters:
    -----------
    df : pandas.DataFrame
        The DataFrame to be stratified and split into training and testing sets.
    kl : int, optional (default=10)
        The number of class labels to keep in the stratified DataFrame.
    train_size : float, optional (default=0.7)
        The proportion of the stratified dataset to include in the training set.
    random_num : int, optional (default=1)
        The random seed for reproducible results.
    tid_col : str, optional (default='tid')
        The name of the column to be used as the trajectory identifier.
    class_col : str, optional (default='label')
        The name of the column to be treated as the class/label column.
    organize_columns : bool, optional (default=True)
        A flag indicating whether to organize columns before saving.
    mat_columns : list of str, optional (unused for now)
        A list of column names to be included in the .mat files, if set to save.
    fileprefix : str, optional (default='')
        The prefix to be added to the file names when saving.
    outformats : list of str, optional
        A list of output formats for saving the datasets (e.g., ['csv', 'zip', 'parquet']).
    data_path : str, optional (default='.')
        The directory path where the output files will be saved.

    Returns:
    --------
    train : pandas.DataFrame
        A DataFrame containing the training set.
    test : pandas.DataFrame
        A DataFrame containing the testing set.
    """
    min_elements = 1

    random.seed(random_num)
    labels = df[class_col].unique()
    n = min_elements if kl < min_elements else kl
    labels_index = random.sample(list(labels), n)
    df = df.loc[df[class_col].isin(labels_index)].copy()

    train_index, test_index, _ = splitTIDs(df, train_size, random_num, tid_col, class_col,
                                           min_elements=min_elements)

    if organize_columns:
        df, columns_order_zip, columns_order_csv = organizeFrame(df, None, tid_col, class_col)
    else:
        columns_order_zip = list(df.columns)
        columns_order_csv = list(df.columns)

    train = df.loc[df[tid_col].isin(train_index)]
    test = df.loc[df[tid_col].isin(test_index)]

    for outType in outformats:
        path = 'L' + str(n)
        if not os.path.exists(os.path.join(data_path, path)):
            os.makedirs(os.path.join(data_path, path))

        writeFiles(data_path, os.path.join(path, fileprefix), train, test, tid_col, class_col,
                   columns_order_zip if outType in ['zip', 'mat'] else columns_order_csv,
                   mat_columns, None, outType, opSuff=path)

    return train, test
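# Usage sketch (illustrative): restrict the dataset to 10 randomly chosen labels,
# then split 70/30; when output formats are given, files go to a subfolder named
# after the label count (here ./out/L10):
#
#   train, test = klabels_stratify(df, kl=10, data_path='./out', outformats=['csv'])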
def joinTrainTest(dir_path, train_file="train.csv", test_file="test.csv", tid_col='tid',
                  class_col='label', to_file=False):
    """
    Joins training and testing datasets from separate files into a single DataFrame.

    Parameters:
    -----------
    dir_path : str
        The directory path where the training and testing files are located.
    train_file : str, optional (default="train.csv")
        The name of the training file to be read.
    test_file : str, optional (default="test.csv")
        The name of the testing file to be read.
    tid_col : str, optional (default='tid')
        The name of the column to be used as the trajectory identifier.
    class_col : str, optional (default='label')
        The name of the column to be treated as the class/label column.
    to_file : bool, optional (default=False)
        A flag indicating whether to also save the joined DataFrame to a file named 'joined.csv'.

    Returns:
    --------
    pandas.DataFrame
        A DataFrame containing the joined training and testing data.
    """
    print("Joining train and test data from... " + dir_path)

    # Read datasets
    dataset_train = readDataset(dir_path, None, train_file)
    dataset_test = readDataset(dir_path, None, test_file)

    dataset = pd.concat([dataset_train, dataset_test])
    dataset = dataset.sort_values([class_col, tid_col])  # keep the sorted result (was discarded)

    if to_file:
        print("Saving joined dataset as: " + os.path.join(dir_path, 'joined.csv'))
        dataset.to_csv(os.path.join(dir_path, 'joined.csv'), index=False)

    print("Done.")
    print(" --------------------------------------------------------------------------------")
    return dataset
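# Usage sketch (illustrative): merge a pre-split dataset back into one frame and
# also save it as joined.csv next to the input files:
#
#   df = joinTrainTest('./data/FoursquareNYC', to_file=True)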
#-------------------------------------------------------------------------->> DESCRIPTORS
def readDsDesc(data_path, folder=None, file='train.csv', tid_col='tid', class_col='label', missing='?'):
    # TODO Deprecated
    # (arguments passed in the correct positions; 'missing' was previously passed as 'tid_col')
    df = readDataset(data_path, folder, file, class_col, tid_col, missing)

    columns_order = [x for x in df.columns if x not in [tid_col, class_col]]
    df = df[columns_order + [tid_col, class_col]]

    if folder is None:
        folder = os.path.basename(data_path)
        data_path = os.path.dirname(data_path)

    return df
def featuresJSON(df, version=1, deftype='nominal', defcomparator='equals', tid_col='tid', label_col='label', file=False):
    """
    Generates a JSON representation of features from a DataFrame.

    Parameters:
    -----------
    df : pandas.DataFrame, list, or dict
        The DataFrame containing the dataset, a list of attribute names (all given `deftype`),
        or a dict mapping attribute names to types.
    version : int, optional (default=1)
        The version number of the JSON schema (1 for MASTERMovelets format, 2 for HiPerMovelets format).
    deftype : str, optional (default='nominal')
        The default type of features.
    defcomparator : str, optional (default='equals')
        The default comparator for features.
    tid_col : str, optional (default='tid')
        The name of the column to be used as the trajectory identifier.
    label_col : str, optional (default='label')
        The name of the column to be treated as the class/label column.
    file : str or bool, optional (default=False)
        If False, the JSON representation is printed to stdout; if a file path (str), the JSON
        representation is written to that file.
    """
    if isinstance(df, list):
        cols = {x: deftype for x in df}
    elif isinstance(df, dict):
        cols = df
    else:
        cols = descTypes(df)

    if tid_col not in cols.keys() or label_col not in cols.keys():
        aux = {tid_col: 'numeric', label_col: 'nominal'}
        cols = {**aux, **cols}

    # Attributes other than the trajectory id and the label:
    feats = [f for f in cols.keys() if f not in [tid_col, label_col]]

    if version == 1:
        s = '{\n "readsDesc": [\n'

        order = 1
        for f, ftype in cols.items():
            s += (' {\n "order": '+str(order)+',\n "type": "'+ftype+'",\n "text": "'+f+'"\n }')
            if len(cols) == order:
                s += ('\n')
            else:
                s += (',\n')
            order += 1

        s += (' ],\n "pointFeaturesDesc": [],\n "subtrajectoryFeaturesDesc": [],\n')
        s += (' "trajectoryFeaturesDesc": [],\n "pointComparisonDesc": {\n "pointDistance": "euclidean",\n')
        s += (' "featureComparisonDesc": [\n')

        order = 1
        for f in feats:
            s += (' {\n "distance": "'+defcomparator+'",\n "maxValue": -1,\n "text": "'+f+'"\n }')
            # Compare against the number of attribute features, so the last item gets no comma:
            if len(feats) == order:
                s += ('\n')
            else:
                s += (',\n')
            order += 1

        s += (' ]\n },\n "subtrajectoryComparisonDesc": {\n "subtrajectoryDistance": "euclidean",\n')
        s += (' "featureComparisonDesc": [\n {\n "distance": "euclidean",\n "text": "points"\n')
        s += (' }\n ]\n }\n}')
    else:  # VERSION 2 (*_hp.json)
        s = '{\n "input": {\n "train": ["train"],\n "test": ["test"],\n "format": "CSV",\n'
        s += ' "loader": "interning"\n },\n'
        s += ' "idFeature": {\n "order": '+str(list(cols.keys()).index(tid_col)+1)+',\n "type": "numeric",\n "text": "'+tid_col+'"\n },\n'
        # Uses label_col (was hardcoded "label"):
        s += ' "labelFeature": {\n "order": '+str(list(cols.keys()).index(label_col)+1)+',\n "type": "nominal",\n "text": "'+label_col+'"\n },\n'
        s += ' "attributes": [\n'

        order = 1
        for f in feats:
            s += ' {\n "order": '+str(order)+',\n "type": "'+cols[f]+'",\n "text": "'+str(f)+'",\n "comparator": {\n "distance": "'+defcomparator+'"\n }\n }'
            if len(feats) == order:
                s += ('\n')
            else:
                s += (',\n')
            order += 1

        s += ' ]\n}'

    if file:
        with open(file, 'w') as out:
            print(s, file=out)
    else:
        print(s)
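# Usage sketch (illustrative; the output file name is hypothetical): write a
# HiPerMovelets (version 2) descriptor for a loaded frame, using the default
# 'equals' comparator for every attribute:
#
#   featuresJSON(df, version=2, file='FoursquareNYC_hp.json')
#
# Passing a dict instead of a DataFrame sets the attribute types explicitly,
# e.g. featuresJSON({'poi': 'nominal', 'time': 'numeric'}, version=2).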
#-------------------------------------------------------------------------->> STATISTICS
def countClasses(data_path, folder, file='train.csv', tid_col='tid', class_col='label', markd=False):
    """
    Counts the occurrences of each class label in a dataset.

    Parameters:
    -----------
    data_path : str
        The directory path where the dataset file is located.
    folder : str
        The subfolder within the data path where the dataset file is located.
    file : str, optional (default='train.csv')
        The name of the dataset file to be read.
    tid_col : str, optional (default='tid')
        The name of the column to be used as the trajectory identifier.
    class_col : str, optional (default='label')
        The name of the column to be treated as the class/label column.
    markd : bool, optional (default=False)
        A flag indicating whether to return the class counts in Markdown format.

    Returns:
    --------
    pandas.Series or str
        If `markd` is False, prints the counts and returns a pandas.Series with the count of each
        class label in the dataset.
        If `markd` is True, returns a str with the counts of each class label in Markdown format.
    """
    df = readDataset(data_path, folder, file, class_col, tid_col)  # ('markd' was wrongly passed as 'missing')
    return countClasses_df(df, tid_col, class_col, markd)
def countClasses_df(df, tid_col='tid', class_col='label', markd=False):
    group = df.groupby([class_col, tid_col])
    df2 = group.apply(lambda x: ', '.join([str(s) for s in list(x[class_col].unique())]))

    md = "Number of Samples: " + str(len(df[tid_col].unique()))
    md += '\n\r'
    md += "Samples by Class:"
    md += '\n\r'

    if markd:
        md += '\n\r'
        md += df2.value_counts().to_markdown(tablefmt="github", headers=["Label", "#"])
        return md
    else:
        print(md)
        print(df2.value_counts())
        return df2.value_counts()
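# Usage sketch (illustrative): per-label trajectory counts for a loaded frame,
# either printed or returned as a Markdown table:
#
#   counts = countClasses_df(df)          # prints and returns a pandas.Series
#   md = countClasses_df(df, markd=True)  # returns a Markdown string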
def dfVariance(df):
    """
    Computes the variance for each column in a DataFrame.

    Parameters:
    -----------
    df : pandas.DataFrame
        The DataFrame for which variance is to be computed.

    Returns:
    --------
    pandas.DataFrame
        A DataFrame with a 'Variance' column holding the variance of each input column,
        sorted in descending order.
    """
    stats = pd.DataFrame()
    dfx = df.apply(pd.to_numeric, args=['coerce'])
    #stats["Mean"] = dfx.mean(axis=0, skipna=True)
    #stats["Std.Dev"] = dfx.std(axis=0, skipna=True)
    stats["Variance"] = dfx.var(axis=0, skipna=True)

    dfx = df.fillna('?')
    for col in df.columns:
        if not np.issubdtype(dfx[col].dtype, np.number):
            # Encode non-numeric columns as ordered categories and use the category codes:
            categories = list(dfx[col].unique())
            dfx[col] = pd.Categorical(dfx[col], categories, ordered=True)
            #stats.loc[col, "Mean"] = categories[int(np.median(dfx[col].cat.codes))]
            #stats.loc[col, "Std.Dev"] = np.std(dfx[col].cat.codes)
            stats.loc[col, "Variance"] = np.var(dfx[col].cat.codes)  # .loc avoids chained assignment

    return stats.sort_values('Variance', ascending=False)
def dfStats(df):
    """
    Computes summary statistics for each column in a DataFrame.

    Parameters:
    -----------
    df : pandas.DataFrame
        The DataFrame for which statistics are to be computed.

    Returns:
    --------
    pandas.DataFrame
        A DataFrame containing summary statistics for each column, including mean, standard
        deviation, and variance. Rows are sorted by variance in descending order.
    """
    stats = pd.DataFrame()
    dfx = df.apply(pd.to_numeric, args=['coerce'])
    stats["Mean"] = dfx.mean(axis=0, skipna=True)
    stats["Std.Dev"] = dfx.std(axis=0, skipna=True)
    stats["Variance"] = dfx.var(axis=0, skipna=True)

    dfx = df.fillna('?')
    for col in df.columns:
        if not np.issubdtype(dfx[col].dtype, np.number):
            # Encode non-numeric columns as ordered categories and use the category codes:
            categories = list(dfx[col].unique())
            dfx[col] = pd.Categorical(dfx[col], categories, ordered=True)
            stats.loc[col, "Mean"] = categories[int(np.median(dfx[col].cat.codes))]
            stats.loc[col, "Std.Dev"] = np.std(dfx[col].cat.codes)
            stats.loc[col, "Variance"] = np.var(dfx[col].cat.codes)  # .loc avoids chained assignment

    return stats.sort_values('Variance', ascending=False)
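# Usage sketch (illustrative): rank attributes by variance; non-numeric columns
# are encoded as ordered categories before the statistics are computed:
#
#   dfVariance(df)  # one 'Variance' column, highest first
#   dfStats(df)     # adds 'Mean' and 'Std.Dev'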
def datasetStatistics(data_path, folder, file_prefix='', tid_col='tid', class_col='label', to_file=False):
    """
    Computes statistics for a dataset, including summary statistics for each column and the class
    distribution, in Markdown format.

    Parameters:
    -----------
    data_path : str
        The directory path where the dataset file(s) are located.
    folder : str
        The subfolder within the data path where the dataset file(s) are located.
    file_prefix : str, optional (default='')
        The prefix to be added to the dataset file names.
    tid_col : str, optional (default='tid')
        The name of the column to be used as the trajectory identifier.
    class_col : str, optional (default='label')
        The name of the column to be treated as the class/label column.
    to_file : str or bool, optional (default=False)
        If False, prints the Markdown statistics. If a file path (str), also saves the statistics
        to that file.

    Returns:
    --------
    str
        The computed statistics as Markdown text.
    """
    def addLine(i):
        return '\n\r' + ('&nbsp;'.join(['\n\r' for x in range(i)])) + '\n\r'

    train = readDsDesc(data_path, folder, file_prefix+'train.csv', tid_col, class_col, missing='NaN')
    test = readDsDesc(data_path, folder, file_prefix+'test.csv', tid_col, class_col, missing='NaN')

    md = '##### Descriptive Statistics for ' + folder

    sam_train = len(train[tid_col].unique())
    sam_test = len(test[tid_col].unique())
    points = len(train) + len(test)
    samples = sam_train + sam_test

    # Longest and shortest trajectory sizes (points per trajectory):
    top_train = train.groupby([tid_col]).count().sort_values(class_col).tail(1)[class_col].iloc[0]
    bot_train = train.groupby([tid_col]).count().sort_values(class_col).head(1)[class_col].iloc[0]
    top_test = test.groupby([tid_col]).count().sort_values(class_col).tail(1)[class_col].iloc[0]
    bot_test = test.groupby([tid_col]).count().sort_values(class_col).head(1)[class_col].iloc[0]

    classes = train[class_col].unique()

    avg_size = points / samples
    diff_size = max(avg_size - min(bot_train, bot_test), max(top_train, top_test) - avg_size)

    stats_df = pd.DataFrame({
        'Number of Classes':    [len(classes), '-', '-'],
        'Number of Attributes': [len(train.columns), '-', '-'],
        'Avg Size of Trajs':    ['{:.2f}'.format(avg_size) + ' / ±{:.2f}'.format(diff_size), '-', '-'],
        'Number of Trajs':      [str(samples), str(sam_train), str(sam_test)],
        'Hold-out':             ['100%', '{:.2f}%'.format(sam_train*100/samples), '{:.2f}%'.format(sam_test*100/samples)],
        'Number of Points':     [str(points), str(len(train)), str(len(test))],
        'Longest Size':         [str(max(top_train, top_test)), str(top_train), str(top_test)],
        # The shortest size overall is the minimum of the two splits (was max):
        'Shortest Size':        [str(min(bot_train, bot_test)), str(bot_train), str(bot_test)],
    }, index=['Total', 'Train', 'Test'])

    md += addLine(1)
    md += stats_df.to_markdown(tablefmt="github", colalign="right")
    md += addLine(2)

    md += '###### Attributes: '
    md += ', '.join([str(x) for x in train.columns])
    md += addLine(1)
    md += '###### Labels: '
    md += ', '.join([str(x) for x in classes])
    md += addLine(2)

    df = pd.concat([train, test])
    df.drop([tid_col], axis=1, inplace=True)

    stats = df.describe(include='all').fillna('')
    md += stats.to_markdown(tablefmt="github")
    md += addLine(2)

    md += 'Descriptive Statistics (by Variance): '
    md += addLine(1)
    md += dfStats(df).to_markdown(tablefmt="github")
    md += addLine(2)

    if len(classes) < 15:
        md += '###### Labels for TRAIN:'
        md += addLine(1)
        md += countClasses_df(train, tid_col, class_col, markd=True)
        md += addLine(2)

        md += '###### Labels for TEST.:'
        md += addLine(1)
        md += countClasses_df(test, tid_col, class_col, markd=True)

    if to_file:
        with open(to_file, "w") as f:
            f.write(md)
    else:
        print('\n--------------------------------------------------------------------')
        print(md)

    return md
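# Usage sketch (illustrative; paths are hypothetical): build the Markdown report
# for a dataset folder holding train.csv and test.csv, and save it to a file:
#
#   md = datasetStatistics('./data', 'FoursquareNYC',
#                          to_file='./data/FoursquareNYC/stats.md')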
#-------------------------------------------------------------------------->> HELPERS
def splitTIDs(df, train_size=0.7, random_num=1, tid_col='tid', class_col='label', min_elements=1):
    train = list()
    test = list()

    df_ = df.groupby(tid_col).first().reset_index()[[tid_col, class_col]]

    def splitByLabel(label):
        nonlocal df_, train, test
        tids = df_.loc[df_[class_col] == label][tid_col].unique()
        random.seed(random_num)

        n = int(float(len(tids))*train_size)
        n = min_elements if n < min_elements else n

        train_index = random.sample(list(tids), n)
        test_index = tids[np.isin(tids, train_index, invert=True)]

        train = train + list(train_index)
        test = test + list(test_index)

    list(map(lambda label: splitByLabel(label), tqdm(df_[class_col].unique())))

    return train, test, df_

def splitData(df, k, random_num, tid_col='tid', class_col='label', opLabel='Splitting Data', ignore_ltk=True):
    if ignore_ltk:  # removes labels with less than k trajectories
        df = dropLabelsltk(df, k, tid_col, class_col)

    ktrain = []
    ktest = []
    for x in range(k):
        ktrain.append(pd.DataFrame())
        ktest.append(pd.DataFrame())

    kfold = KFold(n_splits=k, shuffle=True, random_state=random_num)

    def addData(label):
        tids = df.loc[df[class_col] == label][tid_col].unique()
        x = 0
        for train_idx, test_idx in kfold.split(tids):
            ktrain[x] = pd.concat([ktrain[x], df.loc[df[tid_col].isin(tids[train_idx])]])
            ktest[x] = pd.concat([ktest[x], df.loc[df[tid_col].isin(tids[test_idx])]])
            x += 1

    list(map(lambda label: addData(label), tqdm(df[class_col].unique(), desc=opLabel)))

    return ktrain, ktest

def dropLabelsltk(df, k, tid_col='tid', class_col='label'):
    # Drops all trajectories of labels that have fewer than k trajectories:
    df_ = df.groupby(by=class_col, as_index=False).agg({tid_col: pd.Series.nunique})
    index_names = df[df[class_col].isin(df_[df_[tid_col] < k][class_col])].index
    return df.drop(index_names)

def labels_extract(df, labels=[], tid_col='tid', class_col='label', organize_columns=True):
    df = df.loc[df[class_col].isin(labels)].copy()

    if organize_columns:
        df, columns_order_zip, columns_order_csv = organizeFrame(df, None, tid_col, class_col)
    else:
        columns_order_zip = list(df.columns)
        columns_order_csv = list(df.columns)

    return df

def writeFile(data_path, df, file, tid_col, class_col, columns_order, mat_columns=None,
              desc_cols=None, outformat='zip', opSuff=''):
    if outformat == 'zip':
        # WRITE ZIP >> FOR MASTERMovelets:
        df2zip(data_path, df, file, tid_col, class_col, select_cols=columns_order,
               opLabel='Writing - ZIP |' + opSuff)
    elif outformat == 'csv':
        print('Writing - CSV |' + opSuff)
        df[columns_order].to_csv(os.path.join(data_path, file+".csv"), index=False)
    elif outformat == 'parquet':
        print('Writing - Parquet |' + opSuff)
        df[columns_order].to_parquet(os.path.join(data_path, file+".parquet"), index=False)
    elif outformat == 'mat':
        # WRITE MAT Files >> FOR HiPerMovelets:
        df2mat(df, data_path, file, cols=columns_order, mat_cols=mat_columns, tid_col=tid_col,
               class_col=class_col, desc_cols=desc_cols, opLabel='Writing - MAT|' + opSuff)

def writeFiles(data_path, file, train, test, tid_col, class_col, columns_order, mat_columns=None,
               desc_cols=None, outformat='zip', opSuff=''):
    # WRITE Train
    writeFile(data_path, train, file+'train', tid_col, class_col, columns_order,
              mat_columns, desc_cols, outformat, opSuff='TRAIN - '+opSuff)
    # WRITE Test
    writeFile(data_path, test, file+'test', tid_col, class_col, columns_order,
              mat_columns, desc_cols, outformat, opSuff='TEST - '+opSuff)

#-------------------------------------------------------------------------->>
def splitframe(data, name='tid'):
    n = data[name][0]

    df = pd.DataFrame(columns=data.columns)
    datalist = []

    for i in range(len(data)):
        if data[name][i] == n:
            df = pd.concat([df, data.iloc[[i]]])  # DataFrame.append was removed in pandas 2.x
        else:
            datalist.append(df)
            df = pd.DataFrame(columns=data.columns)
            n = data[name][i]
            df = pd.concat([df, data.iloc[[i]]])

    datalist.append(df)  # include the last trajectory group (was dropped)
    return datalist

#--------------------------------------------------------------------------------
def convertDataset(dir_path, k=None, cols=None, fileprefix='', tid_col='tid', class_col='label'):
    def convert_file(file, cols):
        df = readDataset(dir_path, file=fileprefix+file+'.csv')
        if not cols:
            cols = list(df.columns)
        df, columns_order_zip, columns_order_csv = organizeFrame(df, cols, tid_col, class_col)

        # Only write the formats that do not exist yet:
        outformats = []
        if not os.path.exists(os.path.join(dir_path, file+'.zip')):
            outformats.append('zip')
        if not os.path.exists(os.path.join(dir_path, fileprefix+file+'.csv')):
            outformats.append('csv')
        if not os.path.exists(os.path.join(dir_path, fileprefix+file+'.mat')):
            outformats.append('mat')

        return df, columns_order_zip, columns_order_csv, outformats

    df_test, columns_order_zip, columns_order_csv, outformats = convert_file('test', cols)
    df_train, columns_order_zip, columns_order_csv, outformats = convert_file('train', cols)

    for outType in outformats:
        # Pass outType as 'outformat' (it was previously landing on 'desc_cols'):
        writeFiles(dir_path, fileprefix, df_train, df_test, tid_col, class_col,
                   columns_order_zip if outType in ['zip', 'mat'] else columns_order_csv,
                   None, None, outType, opSuff='')

    data = pd.concat([df_train, df_test])
    if k and not os.path.exists(os.path.join(dir_path, 'run1')):
        # Assumption: the run folders need zip output for the renames/removals below.
        train, test = kfold_trainTestSplit(data, k, fileprefix=fileprefix, random_num=1,
                                           tid_col=tid_col, class_col=class_col,
                                           columns_order=columns_order_csv,
                                           data_path=dir_path, outformats=['zip'])

        for i in range(1, k+1):
            for file in ['train', 'test']:
                os.rename(os.path.join(dir_path, 'run'+str(i), fileprefix+file+'.zip'),
                          os.path.join(dir_path, 'run'+str(i), file+'.zip'))

        if 'space' in columns_order_zip:
            # Pass 'data' (was None, which organizeFrame cannot handle):
            kfold_trainTestSplit(data, k, random_num=1, fileprefix='raw_', tid_col=tid_col,
                                 class_col=class_col, columns_order=columns_order_csv,
                                 ktrain=train, ktest=test, data_path=dir_path, outformats=['zip'])

            for i in range(1, k+1):
                for file in ['train', 'test']:
                    os.remove(os.path.join(dir_path, 'run'+str(i), 'raw_'+file+'.zip'))

    print("All Done.")
#--------------------------------------------------------------------------------
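# Usage sketch (illustrative; the directory is hypothetical): ensure a dataset
# folder has train/test files in the missing formats, plus 5 k-fold run folders:
#
#   convertDataset('./data/FoursquareNYC', k=5)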