# -*- coding: utf-8 -*-
"""
**Multiple Aspect Trajectory Tools Framework**
*MAT-data: Data Preprocessing for Multiple Aspect Trajectory Data Mining*
The present application offers a tool, to support the user in the classification task of multiple aspect trajectories,
specifically for extracting and visualizing the movelets, the parts of the trajectory that better discriminate a class.
It integrates into a unique platform the fragmented approaches available for multiple aspects trajectories and in
general for multidimensional sequence classification into a unique web-based and python library system. Offers both
movelets visualization and classification methods.
Created on Dec, 2023
Copyright (C) 2023, License GPL Version 3 or superior (see LICENSE file)
@author: Tarlis Portela
----
"""
import os
import pandas as pd
import numpy as np
from zipfile import ZipFile
from tqdm.auto import tqdm
# IN METHOD, ts2df: #from matdata.inc.ts_io import load_from_tsfile_to_dataframe
#-------------------------------------------------------------------------->>
def csv2df(url, class_col='label', tid_col='tid', missing='?'): # TODO class_col, tid_col unnecessary
"""
Converts a CSV file from a given URL into a pandas DataFrame.
Parameters:
-----------
url : str
The URL pointing to the CSV file to be read.
class_col : str, optional (default='label')
Unused; kept for interface consistency with the other converters.
tid_col : str, optional (default='tid')
Unused; kept for interface consistency with the other converters.
missing : str, optional (default='?')
The placeholder for missing values in the CSV file.
Returns:
--------
pandas.DataFrame
A DataFrame containing the data from the CSV file, with missing values
handled as specified.
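Example:
--------
A minimal usage sketch (the file path below is hypothetical):

>>> df = csv2df('data/train.csv')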
"""
return pd.read_csv(url, na_values=missing)
def parquet2df(url, class_col='label', tid_col='tid', missing='?'): # TODO class_col, tid_col unnecessary
"""
Converts a Parquet file from a given URL into a pandas DataFrame.
Parameters:
-----------
url : str
The URL pointing to the Parquet file to be read.
class_col : str, optional (default='label')
Unused; kept for interface consistency with the other converters.
tid_col : str, optional (default='tid')
Unused; kept for interface consistency with the other converters.
missing : str, optional (default='?')
The placeholder for missing values in the dataset.
Returns:
--------
pandas.DataFrame
A DataFrame containing the data from the Parquet file, with missing values
handled as specified.
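Example:
--------
A minimal usage sketch (the file path below is hypothetical):

>>> df = parquet2df('data/train.parquet')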
"""
df = pd.read_parquet(url)
if missing:
df = df.fillna(missing)
return df
def zip2df(url, class_col='label', tid_col='tid', missing='?', opLabel='Reading ZIP'):
"""
Extracts and converts a CSV trajectory file from a ZIP archive located at a given URL into a pandas DataFrame.
Parameters:
-----------
url : str
The URL pointing to the ZIP archive containing the CSV file to be read.
class_col : str, optional (default='label')
The name of the column to be treated as the class/label column.
tid_col : str, optional (default='tid')
The name of the column to be used as the unique trajectory identifier.
missing : str, optional (default='?')
The placeholder for missing values in the CSV file.
opLabel : str, optional (default='Reading ZIP')
A label describing the operation, for logging purposes.
Returns:
--------
pandas.DataFrame
A DataFrame containing the data from the extracted CSV files, with missing values
handled as specified and the trajectory identifier and class columns derived from the file names inside the archive.
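Example:
--------
A minimal usage sketch (the archive path below is hypothetical); the ZIP is expected to contain one CSV-formatted file per trajectory, named as produced by df2zip:

>>> df = zip2df('data/train.zip')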
"""
if isinstance(url, str):
url = ZipFile(url)
return read_zip(url, None, class_col, tid_col, missing, opLabel)
def mat2df(url, class_col='label', tid_col='tid', missing='?'):
"""
Converts a MATLAB .mat file from a given URL into a pandas DataFrame.
Parameters:
-----------
url : str
The URL pointing to the .mat file to be read.
class_col : str, optional (default='label')
The name of the column to be treated as the class/label column.
tid_col : str, optional (default='tid')
The name of the column to be used as the unique trajectory identifier.
missing : str, optional (default='?')
The placeholder for missing values in the dataset.
Returns:
--------
pandas.DataFrame
A DataFrame containing the data from the .mat file, with missing values
handled as specified and columns renamed if necessary.
Raises:
-------
Exception
Not Implemented.
"""
raise Exception('Not Implemented')
#def read_mat(url, class_col='label', tid_col='tid', missing='?'):
# raise Exception('Not Implemented')
def ts2df(url, class_col='label', tid_col='tid', missing='?'):
"""
Converts a time series file from a given URL into a pandas DataFrame.
Parameters:
-----------
url : str
The URL pointing to the time series file to be read.
class_col : str, optional (default='label')
The name of the column to be treated as the class/label column.
tid_col : str, optional (default='tid')
The name of the column to be used as the unique trajectory identifier.
missing : str, optional (default='?')
The placeholder for missing values in the dataset.
Returns:
--------
pandas.DataFrame
A DataFrame containing the data from the time series file, with missing values
handled as specified and columns renamed if necessary.
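Example:
--------
A minimal usage sketch (the file path below is hypothetical):

>>> df = ts2df('data/train.ts')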
"""
from matdata.inc.ts_io import load_from_tsfile_to_dataframe
return load_from_tsfile_to_dataframe(url, replace_missing_vals_with=missing)
def xes2df(url, class_col='label', tid_col='tid', opLabel='Converting XES', save=False, start_tid=1):
"""
Converts an XES (eXtensible Event Stream) file from a given URL into a pandas DataFrame.
Parameters:
-----------
url : str
The URL pointing to the XES file to be read.
class_col : str, optional (default='label')
The name of the column to be treated as the class/label column.
tid_col : str, optional (default='tid')
The name of the column to be used as the trajectory identifier.
opLabel : str, optional (default='Converting XES')
A label describing the operation, useful for logging or display purposes.
save : bool, optional (default=False)
A flag indicating whether to save the DataFrame to a file after conversion (currently unused).
start_tid : int, optional (default=1)
The starting value for the generated trajectory identifiers, since `tid_col` values are created during conversion.
Returns:
--------
pandas.DataFrame
A DataFrame containing the data from the XES file, with columns renamed if necessary.
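Example:
--------
A minimal usage sketch (the file path below is hypothetical); requires the pm4py package:

>>> df = xes2df('data/eventlog.xes')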
"""
start_tid = start_tid-1
def getTrace(log, tid):
t = dict(log[tid].attributes)
return t
def getEvent(log, tid , j, attrs):
ev = dict(log[tid][j])
eqattr = set(attrs.keys()).intersection(set(ev.keys()))
for k in eqattr:
attrs[k+'_t'] = attrs.pop(k)
ev.update(attrs)
ev['tid'] = start_tid+tid+1
return ev
import pm4py
if isinstance(url, str):
log = pm4py.read_xes(url)
else:
log = pm4py.parse_event_log_string(url)
data = list(map(lambda tid:
pd.DataFrame(list(map(lambda j: getEvent(log, tid , j, getTrace(log, tid)), range(len(log[tid]))))),
tqdm(range(len(log)), desc=opLabel)))
df = pd.concat(data, ignore_index=True)
return df
#-------------------------------------------------------------------------->>
def df2parquet(df, data_path, file="train", tid_col='tid', class_col='label', select_cols=None, opLabel='Writing Parquet'):
"""
Writes a pandas DataFrame to a Parquet file.
Parameters:
-----------
df : pandas.DataFrame
The DataFrame to be written to the Parquet file.
data_path : str
The directory path where the Parquet file will be saved.
file : str, optional (default='train')
The base name of the Parquet file (without extension).
tid_col : str, optional (default='tid')
The name of the column to be used as the trajectory identifier.
class_col : str, optional (default='label')
The name of the column to be treated as the class/label column.
select_cols : list of str, optional
A list of column names to be included in the Parquet file. If None, all columns are included.
opLabel : str, optional (default='Writing Parquet')
A label describing the operation, useful for logging or display purposes.
Returns:
--------
pandas.DataFrame
The input DataFrame
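Example:
--------
A minimal usage sketch (the DataFrame and output path are hypothetical); writes 'output/train.parquet':

>>> df2parquet(df, 'output', file='train')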
"""
F = os.path.join(data_path, file+'.parquet')
print(opLabel+": " + F)
if not os.path.exists(data_path):
os.makedirs(data_path)
if not select_cols:
select_cols = list(df.columns)
df[select_cols].to_parquet(F)
print("Done.")
print(" --------------------------------------------------------------------------------")
return df
def df2csv(df, data_path, file="train", tid_col='tid', class_col='label', select_cols=None, opLabel='Writing CSV'):
"""
Writes a pandas DataFrame to a CSV file.
Parameters:
-----------
df : pandas.DataFrame
The DataFrame to be written to the CSV file.
data_path : str
The directory path where the CSV file will be saved.
file : str, optional (default='train')
The base name of the CSV file (without extension).
tid_col : str, optional (default='tid')
The name of the column to be used as the trajectory identifier.
class_col : str, optional (default='label')
The name of the column to be treated as the class/label column.
select_cols : list of str, optional
A list of column names to be included in the CSV file. If None, all columns are included.
opLabel : str, optional (default='Writing CSV')
A label describing the operation, useful for logging or display purposes.
Returns:
--------
pandas.DataFrame
The input DataFrame
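Example:
--------
A minimal usage sketch (the DataFrame and output path are hypothetical); writes 'output/train.csv':

>>> df2csv(df, 'output', file='train')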
"""
F = os.path.join(data_path, file+'.csv')
print(opLabel + ": " + F)
if not os.path.exists(data_path):
os.makedirs(data_path)
if not select_cols:
select_cols = list(df.columns)
df[select_cols].to_csv(F)
print("Done.")
print(" --------------------------------------------------------------------------------")
return df
def df2zip(df, data_path, file, tid_col='tid', class_col='label', select_cols=None, opLabel='Writing MAT'):
"""
Writes each trajectory of a pandas DataFrame to an individual CSV-formatted file and compresses them into a ZIP archive.
Parameters:
-----------
df : pandas.DataFrame
The DataFrame to be written to the CSV file and then compressed into a ZIP archive.
data_path : str
The directory path where the ZIP archive will be saved.
file : str
The base name of the CSV file (without extension) to be compressed into the ZIP archive.
tid_col : str, optional (default='tid')
The name of the column to be used as the trajectory identifier.
class_col : str, optional (default='label')
The name of the column to be treated as the class/label column.
select_cols : list of str, optional
A list of column names to be included in the CSV file. If None, all columns are included.
opLabel : str, optional (default='Writing MAT')
A label describing the operation, useful for logging or display purposes.
Returns:
--------
pandas.DataFrame
The input DataFrame
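Example:
--------
A minimal usage sketch (the DataFrame and output path are hypothetical); writes 'output/train.zip' containing one file per trajectory:

>>> df2zip(df, 'output', 'train', tid_col='tid', class_col='label')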
"""
EXT = '.r2'
if not os.path.exists(data_path):
os.makedirs(data_path)
zipf = ZipFile(os.path.join(data_path, file+'.zip'), 'w')
n = len(str(len(df.index)))
tids = df[tid_col].unique()
if not select_cols:
select_cols = list(df.columns)
select_cols = [x for x in select_cols if x not in [tid_col, class_col]]
def writeMAT(x):
filename = str(x).rjust(n, '0') + ' s' + str(x) + ' c' + str(df.loc[df[tid_col] == x][class_col].iloc[0]) + EXT
data = df[df[tid_col] == x]
# Selected
if select_cols is not None:
data = data[select_cols]
# Remove tid and label:
data = data.drop([tid_col, class_col], axis=1, errors='ignore')
data.to_csv(filename, index=False, header=False)
zipf.write(filename)
os.remove(filename)
list(map(lambda x: writeMAT(x), tqdm(tids, desc=opLabel)))
zipf.close()
return df
def df2mat(df, folder, file, cols=None, mat_cols=None, desc_cols=None, label_columns=None, other_dsattrs=None,
tid_col='tid', class_col='label', opLabel='Converting MAT'):
"""
Converts a pandas DataFrame to a Multiple Aspect Trajectory .mat file and saves it to the specified folder.
Parameters:
-----------
df : pandas.DataFrame
The DataFrame to be converted to a .mat file.
folder : str
The directory where the .mat file will be saved.
file : str
The base name of the .mat file (without extension).
cols : list of str, optional
A list of column names from the DataFrame to include in the .mat file. If None, all columns are included.
mat_cols : list of str, optional
A list of column names representing the trajectory-level attributes. If None, no trajectory-level columns are used.
desc_cols : dict, optional
A dictionary mapping aspect names to their types (e.g. {'aspectName': 'type'}), written as descriptive metadata. If None, it is inferred with descTypes.
label_columns : list of str, optional
A list of column names that can be treated as labels in the .mat file.
other_dsattrs : dict, optional
A dictionary of additional dataset attributes to be included in the .mat file.
tid_col : str, optional (default='tid')
The name of the column to be used as the trajectory identifier.
class_col : str, optional (default='label')
The name of the column to be treated as the class/label column.
opLabel : str, optional (default='Converting MAT')
A label describing the operation, useful for logging or display purposes.
Returns:
--------
None
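Example:
--------
A minimal usage sketch (the DataFrame and output folder are hypothetical); writes 'output/Dataset/train.mat':

>>> df2mat(df, 'output/Dataset', 'train', tid_col='tid', class_col='label')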
"""
if '.mat' in file:
url = os.path.join(folder, file)
file = file.replace('.mat', '')
else:
url = os.path.join(folder, file+'.mat')
if not cols:
cols = list(df.columns)
cols = [x for x in cols if x not in [tid_col, class_col]]
if mat_cols:
mat_cols = [x for x in mat_cols if x not in [tid_col, class_col]]
f = open(url, "w")
f.write("# Dataset: " + os.path.basename(folder) + ' (comment description)\n')
f.write("@problemName " + os.path.basename(folder) + '\n')
if label_columns:
f.write('@labelColumns ' + (','.join(label_columns)) + '\n')
f.write("@missing "+ str(df.apply(lambda ts: '?' in ts.values, axis=1).any() or df.isnull().any().any())+'\n')
f.write("@aspects " + str(len(cols)) + '\n')
f.write('@aspectNames ' + (','.join(cols)) + '\n')
if mat_cols:
f.write('@trajectoryAspectNames ' + (','.join(mat_cols)) + '\n')
if not desc_cols:
# dictionary in the format: {'aspectName': 'type', 'aspectName': 'type'}
desc_cols = descTypes(df)
f.write('@aspectDescriptor ' + (','.join(':'.join((key,val)) for (key,val) in desc_cols.items())) + '\n')
if other_dsattrs:
for k,v in other_dsattrs.items():
f.write('@'+k+' ' + (','.join(v)) + '\n')
f.write("@data\n")
def getTrace(df, tid):
s = ''
s += '@trajectory \n' + str(tid) + ',' + str(df[class_col].values[0]) + '\n'
if mat_cols:
s += '@trajectoryAspects\n'
s += df[mat_cols][0:1].to_csv(index=False,header=False, quotechar='"')
s += '@trajectoryPoints\n'
s += df[cols].to_csv(index=False,header=False, quotechar='"')
return s
list(map(lambda tid: f.write(getTrace(df[df[tid_col] == tid], tid)),
tqdm(df[tid_col].unique(), desc=opLabel)))
f.close()
#-------------------------------------------------------------------------->>
def read_zip(zipFile, cols=None, class_col='label', tid_col='tid', missing='?', opLabel='Reading ZIP'):
### [Private helper function]
data = pd.DataFrame()
with zipFile as z:
files = z.namelist()
files.sort()
def readCSV(filename):
if cols is not None:
df = pd.read_csv(z.open(filename), names=cols, na_values=missing)
else:
df = pd.read_csv(z.open(filename), header=None, na_values=missing)
df[tid_col] = filename.split(" ")[1][1:]
df[class_col] = filename.split(" ")[2][1:-3]
return df
data = list(map(lambda filename: readCSV(filename), tqdm(files, desc=opLabel)))
data = pd.concat(data)
return data
#-------------------------------------------------------------------------->>
def zip2csv(folder, file, cols, class_col = 'label', tid_col='tid', missing='?'):
"""
Extracts and compiles trajectory CSV files from a ZIP archive into a pandas DataFrame, and saves the result as a CSV file.
Parameters:
-----------
folder : str
The directory path where the ZIP archive is located, and destination to the CSV resulting file.
file : str
The name of the ZIP archive file (with or without extension).
cols : list of str
A list of column names to be included in the DataFrame.
class_col : str, optional (default='label')
The name of the column to be treated as the class/label column.
tid_col : str, optional (default='tid')
The name of the column to be used as the trajectory identifier.
missing : str, optional (default='?')
The placeholder for missing values in the CSV file.
Returns:
--------
pandas.DataFrame
A DataFrame containing the data from the extracted CSV file, with missing values
handled as specified and columns renamed if necessary.
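Example:
--------
A minimal usage sketch (folder, file name, and column names are hypothetical); also saves 'data/Dataset/train.csv':

>>> df = zip2csv('data/Dataset', 'train', cols=['lat', 'lon', 'time', 'poi'])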
"""
if '.zip' in file:
    file = file.replace('.zip', '')
# Read the per-trajectory CSV files inside the ZIP archive:
data = read_zip(ZipFile(os.path.join(folder, file+'.zip')), cols, class_col, tid_col, missing)
print("Saving dataset as: " + os.path.join(folder, file+'.csv'))
data.to_csv(os.path.join(folder, file+'.csv'), index = False)
print("Done.")
print(" --------------------------------------------------------------------------------")
return data
def zip2arf(folder, file, cols, tid_col='tid', class_col = 'label', missing='?', opLabel='Reading CSV'):
"""
Extracts a CSV file from a ZIP archive and converts it into an ARFF (Attribute-Relation File Format) file.
Parameters:
-----------
folder : str
The directory path where the ZIP archive is located.
file : str
The name of the ZIP archive file (with or without extension).
cols : list of str
A list of column names to be included in the ARFF file.
tid_col : str, optional (default='tid')
The name of the column to be used as the trajectory identifier.
class_col : str, optional (default='label')
The name of the column to be treated as the class/label column.
missing : str, optional (default='?')
The placeholder for missing values in the CSV file.
opLabel : str, optional (default='Reading CSV')
A label describing the operation, useful for logging or display purposes.
Returns:
--------
pandas.DataFrame
A DataFrame containing the data from the extracted ZIP file, with missing values
handled as specified and columns renamed if necessary.
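Example:
--------
A minimal usage sketch (folder, file name, and column names are hypothetical); reads 'data/Dataset/train.zip' and saves 'data/Dataset/train.csv':

>>> df = zip2arf('data/Dataset', 'train', cols=['lat', 'lon', 'time', 'poi'])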
"""
data = pd.DataFrame()
print("Converting "+file+" data from... " + folder)
if '.zip' in file:
url = os.path.join(folder, file)
else:
url = os.path.join(folder, file+'.zip')
with ZipFile(url) as z:
# for filename in z.namelist():
# # data = filename.readlines()
# df = pd.read_csv(z.open(filename), names=cols, na_values=missing)
# # print(filename)
# df[tid_col] = filename.split(" ")[1][1:]
# df[class_col] = filename.split(" ")[2][1:-3]
# data = pd.concat([data,df])
def readCSV(filename):
# data = filename.readlines()
df = pd.read_csv(z.open(filename), names=cols, na_values=missing)
# print(filename)
df[tid_col] = filename.split(" ")[1][1:]
df[class_col] = filename.split(" ")[2][1:-3]
return df
data = list(map(lambda filename: readCSV(filename), tqdm(z.namelist(), desc=opLabel)))
data = pd.concat(data)
print("Done.")
print("Saving dataset as: " + os.path.join(folder, file+'.csv'))
data.to_csv(os.path.join(folder, file+'.csv'), index = False)
print("Done.")
print(" --------------------------------------------------------------------------------")
return data
def any2ts(data_path, folder, file, cols=None, tid_col='tid', class_col = 'label', opLabel='Converting TS'):
"""
Converts data from various formats (CSV, Parquet, etc.) to a time series format.
Parameters:
-----------
data_path : str
The directory path where the data files are located.
folder : str
The folder containing the data file to be converted.
file : str
The name of the data file to be converted.
cols : list of str, optional
A list of column names to be included in the time series data.
tid_col : str, optional (default='tid')
The name of the column to be used as the trajectory identifier.
class_col : str, optional (default='label')
The name of the column to be treated as the class/label column.
opLabel : str, optional (default='Converting TS')
A label describing the operation, useful for logging or display purposes.
Returns:
--------
pandas.DataFrame
A DataFrame containing the time series data, with trajectory identifier, class label, and specified columns.
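Example:
--------
A minimal usage sketch (paths and names are hypothetical); loads the dataset via readDataset and writes 'data/Dataset/Dataset_TRAIN.ts':

>>> df = any2ts('data', 'Dataset', 'train')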
"""
print("Converting "+file+" data from... " + data_path + " - " + folder)
data = readDataset(data_path, folder, file, class_col)
file = file.replace('specific_', '')
tsName = os.path.join(data_path, folder, folder+'_'+file.upper()+'.ts')
tsDesc = os.path.join(data_path, folder, folder+'.md')
print("Saving dataset as: " + tsName)
if not cols:
cols = list(data.columns)
cols = [x for x in cols if x not in [tid_col, class_col]]
f = open(tsName, "w")
if os.path.exists(tsDesc):
fd = open(tsDesc, "r")
for line in fd:
f.write("# " + line)
# fd.close()
f.write("#\n")
f.write("@problemName " + folder + '\n')
f.write("@timeStamps false")
f.write("@missing "+ str('?' in data)+'\n')
f.write("@univariate "+ ('false' if len(cols) > 1 else 'true') +'\n')
f.write("@dimensions " + str(len(cols)) + '\n')
f.write("@equalLength false" + '\n')
f.write("@seriesLength " + str(len(data[data[tid_col] == data[tid_col][0]])) + '\n')
f.write("@classLabel true " + ' '.join([str(x).replace(' ', '_') for x in list(data[class_col].unique())]) + '\n')
f.write("@data\n")
# for tid in data[tid_col].unique():
def writeLine(tid):
df = data[data[tid_col] == tid]
line = ''
for col in cols:
line += ','.join(map(str, list(df[col]))) + ':'
f.write(line + str(df[class_col].unique()[0]) + '\n')
list(map(lambda tid: writeLine(tid), tqdm(data[tid_col].unique(), desc=opLabel)))
f.write('\n')
f.close()
print("Done.")
print(" --------------------------------------------------------------------------------")
return data
# --------------------------------------------------------------------------------
def descTypes(df):
### [Private helper function]
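# Illustrative (hypothetical) mapping: for a DataFrame with a categorical 'poi' column,
# a numeric 'rating' column and an object 'lat_lon' column, descTypes returns
# {'poi': 'nominal', 'rating': 'numeric', 'lat_lon': 'space2d'}.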
def getType(k, t):
if t.name == 'category':
return 'nominal'
elif t is int or t is float or np.issubdtype(t, np.number):
return 'numeric'
elif 'space' in k or 'lat_lon' in k:
return 'space2d'
else:
return 'nominal'
return {k: getType(k, df.dtypes[k]) for k in df.columns}