# -*- coding: utf-8 -*-
"""
**Multiple Aspect Trajectory Tools Framework**
*MAT-data: Data Preprocessing for Multiple Aspect Trajectory Data Mining*
The present application offers a tool to support the user in the classification task of multiple aspect trajectories,
specifically for extracting and visualizing the movelets, the parts of the trajectory that better discriminate a class.
It integrates the fragmented approaches available for multiple aspect trajectories, and for multidimensional sequence
classification in general, into a unified web-based and Python library system. It offers both movelets visualization
and classification methods.
Created on Dec, 2023
Copyright (C) 2023, License GPL Version 3 or later (see LICENSE file)
@author: Tarlis Portela
----
"""
import os
import pandas as pd
import numpy as np
import requests
import subprocess
import tempfile, py7zr
from tqdm.auto import tqdm
from matdata.preprocess import organizeFrame, splitTIDs, readDataset, trainTestSplit, kfold_trainTestSplit
# Repository data on GitHub
USER = "mat-analysis"
REPOSITORY = "datasets"
# The two URLs are a workaround for GitHub's delay in serving recently updated raw files
REPO_URL_API = 'https://api.github.com/repos/{}/{}/contents/{}/{}/'
REPO_URL_RAW = 'https://raw.githubusercontent.com/{}/{}/main/{}/{}/'
DATASET_TYPES = {
'mat': 'Multiple Aspect Trajectories',
'raw': 'Raw Trajectories',
'sequential': 'Sequential Semantics',
'log': 'Event Logs',
'mts': 'Multivariate Time Series',
'uts': 'Univariate Time Series',
}
SUBSET_TYPES = {
'*.specific': 'Multiple',
'mat.specific': 'Multiple Aspect',
'raw.specific': 'Raw',
'sequential.*': 'Semantic',
'mts.specific': 'Multivariate',
'uts.specific': 'Univariate',
'log.specific': 'Event Log',
'log.process': 'Event Log', #Deprecated?
'log.*': 'Semantic',
'*.raw': 'Spatio-Temporal',
'*.spatial': 'Spatial',
'*.geo_only': 'Spatial',
'*.generic': 'Generic',
'*.category': 'Category',
'*.poi': 'POI',
'*.5dims': '5-Dimensions',
'*.genes': 'Genetic Sequence',
}
###############################################################################
# LOAD DATASETs - From https://github.com/mat-analysis/datasets/
###############################################################################
def prepare_ds(df, tid_col='tid', class_col=None, sample_size=1, random_num=1):
"""
Prepare dataset for training or testing (helper function).
Parameters:
-----------
df : pandas.DataFrame
The DataFrame containing the dataset.
tid_col : str, optional
The name of the column representing trajectory IDs (default 'tid').
class_col : str or None, optional
The name of the column representing class labels. If None, no class column is used for ordering data (default None).
sample_size : float, optional
The proportion of the dataset to include in the sample (default 1, i.e., use the entire dataset).
random_num : int, optional
Random seed for reproducibility (default 1).
Returns:
--------
pandas.DataFrame
The prepared dataset with optional sampling.
"""
if class_col:
df.rename(columns={tid_col: 'tid', class_col: 'label'}, inplace=True)
class_col = 'label'
        df.sort_values(['label', 'tid'], inplace=True)
else:
df.rename(columns={tid_col: 'tid'}, inplace=True)
        df.sort_values(['tid'], inplace=True)
if sample_size < 1: # Stratify the data
df_index, _, _ = splitTIDs(df, sample_size, random_num, 'tid', class_col, min_elements=2)
df = df.loc[df['tid'].isin(df_index)]
df, _, columns_order_csv = organizeFrame(df, None, 'tid', class_col)
return df[columns_order_csv]
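# Usage sketch (illustrative, not part of the module API): standardize column names and take a
# stratified 50% sample; the column names 'traj_id' and 'category' below are hypothetical.
#
#   df = prepare_ds(df, tid_col='traj_id', class_col='category', sample_size=0.5)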
# ------------------------------------------------------------
## TODO: For now, all datasets in the repository have tid and label columns. This can change in the future.
def load_ds(dataset='mat.FoursquareNYC', prefix='', missing='-999', sample_size=1, random_num=1):
"""
Load a dataset for training or testing from a GitHub repository.
Parameters:
-----------
dataset : str, optional
The name of the dataset to load (default 'mat.FoursquareNYC').
prefix : str, optional
The prefix to be added to the dataset file name (default '').
missing : str, optional
The placeholder value used to denote missing data (default '-999').
sample_size : float, optional
The proportion of the dataset to include in the sample (default 1, i.e., use the entire dataset).
random_num : int, optional
Random seed for reproducibility (default 1).
Returns:
--------
pandas.DataFrame
The loaded dataset with optional sampling.
"""
def is_file(dsc, dsn, file):
url = REPO_URL_API.format(USER, REPOSITORY, dsc, dsn) + file
try:
resp = requests.head(url)
# return resp.status_code == requests.codes.found
return resp.status_code == requests.codes.ok
except Exception as e:
return False
    # Unused alternative to is_file(): checks the raw URL directly.
    def url_is_file(url):
try:
resp = requests.head(url)
return resp.status_code == requests.codes.found
# return resp.status_code == requests.codes.ok
except Exception as e:
return False
def download(url, tmpdir):
file = os.path.join(tmpdir, os.path.basename(url))
subprocess.run('curl -o {} {}'.format(file, url), shell=True, check=True)
# response = requests.get(url, stream=True)
# with open(os.path.join(tmpdir, os.path.basename(url)), 'wb') as out:
# out.write(response.content)
# #content = response.json()['content']
# #out.write(base64.b64decode(content))
# return True
        return True  # curl runs with check=True, so a failed download raises instead of returning
def read(url):
df = pd.read_parquet(url)
df.fillna(missing, inplace=True)
return prepare_ds(df, tid_col='tid', class_col='label', sample_size=sample_size, random_num=random_num)
# ------
file = 'data.parquet'
if prefix and prefix != '':
file = prefix+'_data.parquet'
dsc = dataset.split('.')[0]
dsn = dataset.split('.')[1]
base = REPO_URL_RAW.format(USER, REPOSITORY, dsc, dsn)
# Try to load: 'data.parquet'
url = base + file
if is_file(dsc, dsn, file): # url_is_file(url):
print("Loading dataset file: " + base)
# return read(url)
with tempfile.TemporaryDirectory() as tmpdir:
download(url, tmpdir)
return read(os.path.join(tmpdir, file))
# Try to load compressed: 'data.parquet.7z'
url = base + file +'.7z'
if is_file(dsc, dsn, file+'.7z'): #url_is_file(url):
print("Loading dataset compressed file: " + base)
with tempfile.TemporaryDirectory() as tmpdir:
download(url, tmpdir)
filename = os.path.join(tmpdir, file +'.7z')
with py7zr.SevenZipFile(filename, 'r') as archive:
archive.extractall(path=tmpdir)
print("Done.")
print(" --------------------------------------------------------------------------------")
return read(os.path.join(tmpdir, file))
    # Try to load compressed and split into multiple volumes: 'data.parquet.7z.001-N'
    if is_file(dsc, dsn, file+'.7z.001'): #url_is_file(url+'.001'):
print("Loading dataset multi-volume files: " + base)
with tempfile.TemporaryDirectory() as tmpdir:
with open(os.path.join(tmpdir, file +'.7z'), 'ab') as outfile: # append in binary mode
i = 1
                while is_file(dsc, dsn, file+'.7z.{:03d}'.format(i)) and download(url+'.{:03d}'.format(i), tmpdir):
with open(os.path.join(tmpdir, file+'.7z.{:03d}'.format(i)) , 'rb') as infile: # open in binary mode also
outfile.write(infile.read())
i += 1
filename = os.path.join(tmpdir, file +'.7z')
with py7zr.SevenZipFile(filename, 'r') as archive:
archive.extractall(path=tmpdir)
print("Done.")
print(" --------------------------------------------------------------------------------")
return read(os.path.join(tmpdir, file))
raise Exception('Unable to load file, check the repository: ' + base)
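# Usage sketch (illustrative; assumes network access to the mat-analysis/datasets repository):
#
#   df = load_ds('mat.FoursquareNYC')                    # full dataset
#   df = load_ds('mat.FoursquareNYC', sample_size=0.25)  # stratified 25% sample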
def load_ds_holdout(dataset='mat.FoursquareNYC', train_size=0.7, prefix='', missing='-999', sample_size=1, random_num=1):
"""
Load a dataset for training and testing with a holdout method from a GitHub repository.
Parameters:
-----------
dataset : str, optional
The name of the dataset file to load from the GitHub repository (default 'mat.FoursquareNYC'). Format as `category.DatasetName`
train_size : float, optional
The proportion of the dataset to include in the training set (default 0.7).
prefix : str, optional
The prefix to be added to the dataset file name (default '').
missing : str, optional
The placeholder value used to denote missing data (default '-999').
sample_size : float, optional
The proportion of the dataset to include in the sample (default 1, i.e., use the entire dataset).
random_num : int, optional
Random seed for reproducibility (default 1).
Returns:
--------
train : pandas.DataFrame
The training dataset.
test : pandas.DataFrame
The testing dataset.
"""
df = load_ds(dataset, prefix, missing, sample_size, random_num)
    # Class-balanced train/test split:
train, test = trainTestSplit(df, train_size, random_num)
return train, test
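# Usage sketch (illustrative): class-balanced 70/30 holdout split of a repository dataset.
#
#   train, test = load_ds_holdout('mat.FoursquareNYC', train_size=0.7)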
def load_ds_kfold(dataset='mat.FoursquareNYC', k=5, prefix='', missing='-999', sample_size=1, random_num=1):
"""
Load a dataset for k-fold cross-validation from a GitHub repository.
Parameters:
-----------
dataset : str, optional
The name of the dataset file to load from the GitHub repository (default 'mat.FoursquareNYC').
k : int, optional
The number of folds for cross-validation (default 5).
prefix : str, optional
The prefix to be added to the dataset file name (default '').
missing : str, optional
The placeholder value used to denote missing data (default '-999').
sample_size : float, optional
The proportion of the dataset to include in the sample (default 1, i.e., use the entire dataset).
random_num : int, optional
Random seed for reproducibility (default 1).
Returns:
--------
    ktrain : list of pandas.DataFrame
The training datasets for each fold.
ktest : list of pandas.DataFrame
The testing datasets for each fold.
"""
df = load_ds(dataset, prefix, missing, sample_size, random_num)
    # Class-balanced k-fold train/test split:
ktrain, ktest = kfold_trainTestSplit(df, k, random_num)
return ktrain, ktest
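# Usage sketch (illustrative): class-balanced 5-fold split of a repository dataset.
#
#   ktrain, ktest = load_ds_kfold('mat.FoursquareNYC', k=5)
#   for train, test in zip(ktrain, ktest):
#       pass  # train/evaluate per fold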
# ------------------------------------------------------------
def repository_datasets():
"""
Read the datasets available in the repository and organize them by category.
Returns:
--------
dict
A dictionary containing lists of datasets, where each category is a key.
"""
url = "https://api.github.com/repos/{}/{}/git/trees/main?recursive=1".format(USER, REPOSITORY)
r = requests.get(url)
res = r.json()
files = list(map(lambda file: file["path"], res["tree"]))
datasets_dict = {}
def create_dict(file):
if file[-3:] == '.md' and '-stats.md' not in file and 'README' not in file and 'TODO' not in file:
            file = file.split('/')  # GitHub API tree paths always use '/'
category = file[0]
if category not in datasets_dict.keys():
datasets_dict[category] = []
name = file[-1].split('.')[0]
datasets_dict[category].append(name)
return file
    for file in files:
        create_dict(file)
return datasets_dict
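# Usage sketch (illustrative): list the repository datasets by category.
#
#   datasets = repository_datasets()
#   print(datasets.get('mat', []))  # dataset names under the 'mat' category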
###############################################################################
# READ DATASETs - From local files
###############################################################################
def read_ds(data_file, tid_col='tid', class_col=None, missing='-999', sample_size=1, random_num=1):
"""
Read a dataset from a file.
Parameters:
-----------
data_file : str
The path to the dataset file.
tid_col : str, optional
The name of the column representing trajectory IDs (default 'tid').
class_col : str or None, optional
The name of the column representing class labels. If None, no class column is used (default None).
missing : str, optional
The placeholder value used to denote missing data (default '-999').
sample_size : float, optional
The proportion of the dataset to include in the sample (default 1, i.e., use the entire dataset).
random_num : int, optional
Random seed for reproducibility (default 1).
Returns:
--------
pandas.DataFrame
The read dataset.
"""
df = readDataset(data_file, class_col=class_col, tid_col=tid_col, missing=missing)
return prepare_ds(df, tid_col, class_col, sample_size, random_num)
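# Usage sketch (illustrative): read a local dataset file; the path below is hypothetical.
#
#   df = read_ds('data/mat/FoursquareNYC/train.csv', tid_col='tid', class_col='label')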
def read_ds_5fold(data_path, prefix='specific', suffix='.csv', tid_col='tid', class_col=None, missing='-999'):
"""
Read datasets for k-fold cross-validation from files in a directory.
See Also
--------
read_ds_kfold : Read datasets for k-fold cross-validation.
Parameters:
-----------
data_path : str
The path to the directory containing the dataset files.
prefix : str, optional
The prefix of the dataset file names (default 'specific').
suffix : str, optional
The suffix of the dataset file names (default '.csv').
tid_col : str, optional
The name of the column representing trajectory IDs (default 'tid').
class_col : str or None, optional
The name of the column representing class labels. If None, no class column is used (default None).
missing : str, optional
The placeholder value used to denote missing data (default '-999').
Returns:
--------
    5_train : list of pandas.DataFrame
The training datasets for each fold.
5_test : list of pandas.DataFrame
The testing datasets for each fold.
"""
return read_ds_kfold(data_path, 5, prefix, suffix, tid_col, class_col, missing)
def read_ds_kfold(data_path, k=5, prefix='specific', suffix='.csv', tid_col='tid', class_col=None, missing='-999'):
"""
Read datasets for k-fold cross-validation from files in a directory.
Parameters:
-----------
data_path : str
The path to the directory containing the dataset files.
k : int, optional
The number of folds for cross-validation (default 5).
prefix : str, optional
The prefix of the dataset file names (default 'specific').
suffix : str, optional
The suffix of the dataset file names (default '.csv').
tid_col : str, optional
The name of the column representing trajectory IDs (default 'tid').
class_col : str or None, optional
The name of the column representing class labels. If None, no class column is used (default None).
missing : str, optional
The placeholder value used to denote missing data (default '-999').
Returns:
--------
    ktrain : list of pandas.DataFrame
The training datasets for each fold.
ktest : list of pandas.DataFrame
The testing datasets for each fold.
"""
dsc = data_path.split(os.path.sep)[-2]
dsn = data_path.split(os.path.sep)[-1]
k_train = []
k_test = []
for fold in tqdm(range(1, k+1), desc='Reading '+str(k)+'-fold dataset '+ dsn + ' of ' + translateCategory(dsn, dsc)):
df_train, df_test = read_ds_holdout(data_path, prefix, suffix, tid_col, class_col, missing, fold)
k_train.append(df_train)
k_test.append(df_test)
return k_train, k_test
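# Usage sketch (illustrative): read a 5-fold dataset stored in 'run1'..'run5' subdirectories;
# the local path below is hypothetical.
#
#   ktrain, ktest = read_ds_kfold('data/mat/FoursquareNYC', k=5, class_col='label')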
def read_ds_holdout(data_path, prefix='specific', suffix='.csv', tid_col='tid', class_col=None, missing='-999', fold=None):
"""
Read datasets for holdout validation from files in a directory.
Parameters:
-----------
data_path : str
The path to the directory containing the dataset files.
prefix : str, optional
The prefix of the dataset file names (default 'specific').
suffix : str, optional
The suffix of the dataset file names (default '.csv').
tid_col : str, optional
The name of the column representing trajectory IDs (default 'tid').
class_col : str or None, optional
The name of the column representing class labels. If None, no class column is used (default None).
missing : str, optional
The placeholder value used to denote missing data (default '-999').
fold : int or None, optional
        The fold number to load for holdout validation; files are read from the corresponding 'run<fold>' subdirectory (e.g. 'run1'). If None, read files directly from `data_path`.
Returns:
--------
train : pandas.DataFrame
The training dataset.
test : pandas.DataFrame
The testing dataset.
"""
dsc = data_path.split(os.path.sep)[-2]
dsn = data_path.split(os.path.sep)[-1]
if prefix and prefix != '':
        files = [prefix+'_train'+suffix, prefix+'_test'+suffix]
else:
files = ['train'+suffix, 'test'+suffix]
if fold:
files = [os.path.join('run'+str(fold), files[0]), os.path.join('run'+str(fold), files[1])]
else:
print('Reading dataset', dsn, 'of', translateCategory(dsn, dsc))
dataset = []
for file in tqdm(files, desc=dsn + ' (' + translateCategory(dsn, dsc) + \
('), fold: '+str(fold) if fold else ')')):
# url = BASE_URL + dsc+'/'+dsn+'/' + file
url = os.path.join(data_path, file)
df = read_ds(url, tid_col, class_col, missing)
dataset.append(df)
return dataset
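# Usage sketch (illustrative): read 'specific_train.csv' and 'specific_test.csv' from a local
# dataset directory; the path below is hypothetical.
#
#   train, test = read_ds_holdout('data/mat/FoursquareNYC', prefix='specific', class_col='label')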
# ------------------------------------------------------------
def translateDesc(dataset, category, descName):
    # NOTE: depends on getDescName(), which is currently commented out at the end of this module.
dst, dsn = descName.split('.')[0].split('_')[0:2]
if dsn in ['allfeat', '5dims']:
return False
if getDescName(category, dataset) == dst:
return dsn
elif dataset in dst:
return dsn
return False
def translateCategory(dataset, category, descName=None):
if descName:
if (category+'.'+descName) in SUBSET_TYPES.keys():
return SUBSET_TYPES[category+'.'+descName]
elif ('*.'+descName) in SUBSET_TYPES.keys():
return SUBSET_TYPES['*.'+descName]
elif (category+'.*') in SUBSET_TYPES.keys():
return SUBSET_TYPES[category+'.*']
else:
return descName.capitalize()
elif category in DATASET_TYPES.keys():
return DATASET_TYPES[category]
else:
return category.split('_')[0].title()
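# Usage sketch (illustrative): resolve human-readable names from the category maps above.
#
#   translateCategory('FoursquareNYC', 'mat')              # -> 'Multiple Aspect Trajectories'
#   translateCategory('FoursquareNYC', 'mat', 'specific')  # -> 'Multiple Aspect'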
# ------------------------------------------------------------
#def getName(dic, dst=None, dsn=None):
# dst = (dst if dst else '*')
# dsn = (dsn if dsn else '*')
# if dst +'.'+ dsn in dic.keys():
# name = dic[dst +'.'+ dsn]
# elif dst +'.*' in dic.keys():
# name = dic[dst +'.*']
# elif '*.*' in dic.keys():
# name = dic['*.*']
#
# if not name:
# name = dsn
# return name
#
#def getDescName(dst, dsn):
# name = getName(DESCRIPTOR_NAMES, dst, dsn)
# if not name:
# name = dsn
# return name
#
#def getFeature(dst, dsn):
# name = getName(FEATURES_NAMES, dst, dsn)
# if not name:
# name = ['poi']
# return name
#
#def getSubset(dsn, feature):
# for key, value in FEATURES_NAMES.items():
# if dsn in key and feature in value:
# if '?' in key:
# return 'generic'
#
# return 'specific'