# -*- coding: utf-8 -*-
"""
**Multiple Aspect Trajectory Tools Framework**
*MAT-data: Data Preprocessing for Multiple Aspect Trajectory Data Mining*
The present application offers a tool to support the user in the classification task of multiple aspect trajectories,
specifically for extracting and visualizing the movelets: the parts of the trajectory that best discriminate a class.
It integrates the fragmented approaches available for multiple aspect trajectories, and for multidimensional
sequence classification in general, into a unique web-based and Python library system. It offers both
movelet visualization and classification methods.
Created on Dec, 2023
Copyright (C) 2023, License GPL Version 3 or superior (see LICENSE file)
@author: Tarlis Portela
----
"""
import os
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import math
import random
import itertools
from matdata.preprocess import writeFile, featuresJSON
# --------------------------------------------------------------------------------
def scalerSamplerGenerator(
        Ns=[100, 10],
        Ms=[10, 10],
        Ls=[8, 10],
        Cs=[2, 10],
        random_seed=1,
        fileprefix='scalability',
        fileposfix='train',
        cols_for_sampling = ['space','time','day','rating','price','weather','root_type','type'],
        save_to=None,
        base_data=None,
        save_desc_files=True,
        outformats=['csv']):
    """
    Generates trajectory datasets based on real data.

    For each dimension (attributes L, trajectories N, points M, classes C) a
    series of datasets is produced where that dimension follows a doubling
    scale while the other three are held at a fixed value.

    Parameters:
    -----------
    Ns : list of int, optional
        Parameters to scale the number of trajectories.
        List of 2 values: starting number, number of elements (default [100, 10])
    Ms : list of int, optional
        Parameters to scale the size of trajectories.
        List of 2 values: starting number, number of elements (default [10, 10])
    Ls : list of int, optional
        Parameters to scale the number of attributes (* doubles the columns).
        List of 2 values: starting number, number of elements (default [8, 10])
    Cs : list of int, optional
        Parameters to scale the number of classes.
        List of 2 values: starting number, number of elements (default [2, 10])
    random_seed : int, optional
        Random seed (default 1)
    fileprefix : str, optional
        Output filename prefix (default 'scalability')
    fileposfix : str, optional
        Output filename postfix (default 'train')
    cols_for_sampling : list or dict, optional
        Columns to add in the generated dataset.
        Default: ['space', 'time', 'day', 'rating', 'price', 'weather', 'root_type', 'type'].
        If a dictionary is provided in the format: {'aspectName': 'type', ...},
        it is used when providing base_data and saving .MAT.
    save_to : str or bool, optional
        Destination folder to save, or False if not to save CSV files (default False)
    base_data : DataFrame, optional
        DataFrame of trajectories to use as a base for sampling data.
        Default: None (uses example data)
    save_desc_files : bool, optional
        True if to save the .json description files, False otherwise (default True)
    outformats : list, optional
        Output file formats for saving (default ['csv'])

    Returns:
    --------
    None
    """
    # Sanity-check the scale starting points.
    # NOTE(review): the list defaults above are rebound below, never mutated,
    # so the shared-mutable-default pitfall does not bite here.
    assert Ns[0] > 0, 'N > 0'
    assert Ms[0] > 0, 'M > 0'
    assert Cs[0] > 0, 'C > 0'
    assert Ls[0] > 0, 'L > 0'
    assert Ns[0] >= Cs[0], 'N >= C'
    assert save_to, 'save_to param must be set.'

    # Random Seed (both numpy and stdlib RNGs are used by the samplers)
    np.random.seed(seed=random_seed)
    random.seed(random_seed)

    df, cols_for_sampling, desc_cols = getSamplingData(base_data, cols_for_sampling)

    # One progress tick per generated dataset.
    pbar = tqdm(range(Ns[1] + Ms[1] + Ls[1] + Cs[1]))

    # Expand [start, count] into doubling scales, e.g. [100, 3] -> [100, 200, 400].
    Ns = getScale(Ns[0], Ns[1])
    Ms = getScale(Ms[0], Ms[1])
    La = getScale(Ls[0], Ls[1])
    Cs = getScale(Cs[0], Cs[1])

    # Fixed (middle-of-scale) values used while one dimension varies.
    miN = getMiddleE(Ns)
    miM = getMiddleE(Ms)
    miL = len(cols_for_sampling) #getMiddleE(La)
    miC = getMiddleE(Cs)

    print('N ::', 'fix. value:', '\t', miN, '\tscale:\t', Ns)
    print('M ::', 'fix. value:', '\t', miM, '\tscale:\t', Ms)
    print('L ::', 'fix. value:', '\t', miL, '\tscale:\t', La)
    print('C ::', 'fix. value:', '\t', miC, '\tscale:\t', Cs)

    # 1 - Scale attributes (reshape columns), fixed trajectories, points, and classes:
    cols = cols_for_sampling.copy()
    prefix = fileprefix #+ '_L'  # NOTE(review): unused local, kept as-is
    for i in range(Ls[1]):
        #if len(cols) == miL:
        #    cols_for_sampling = cols
        samplerGenerator(miN, miM, miC, random_seed, fileprefix, fileposfix, cols, save_to, df, outformats)
        pbar.update(1)
        if save_to and save_desc_files:
            # NOTE(review): desc_cols may be None when base_data is given with a
            # list of columns — featuresJSON behavior for None is not visible here.
            featuresJSON(desc_cols, 1, file=os.path.join(save_to, '_'.join([fileprefix,str(len(cols)),'attrs'])+ ".json"))
            featuresJSON(desc_cols, 2, file=os.path.join(save_to, '_'.join([fileprefix,str(len(cols)),'attrs'])+ "_hp.json"))
        # Double the attribute count for the next round by duplicating the
        # current sampling columns with a numeric suffix.
        if i < Ls[1]-1:
            df_ = df[cols].copy()
            df_ = df_.add_suffix('_'+str(i+1))
            cols = cols + list(df_.columns)
            df = pd.concat([df, df_], axis=1)

    # 2 - Scale trajectories, fixed points, attributes, and classes
    #prefix = fileprefix + '_N'
    for i in Ns:
        pbar.update(1)
        if i == miN:
            continue  # the fixed-N dataset was already generated in step 1
        samplerGenerator(i,miM,miC,random_seed, fileprefix, fileposfix, cols_for_sampling, save_to, df, outformats)

    # 3 - Scale points, fixed trajectories, attributes, and classes
    #prefix = fileprefix + '_M'
    for i in Ms:
        pbar.update(1)
        if i == miM:
            continue  # already generated in step 1
        samplerGenerator(miN,i,miC,random_seed, fileprefix, fileposfix, cols_for_sampling, save_to, df, outformats)

    # 4 - Scale classes, fixed trajectories, points, and attributes
    #prefix = fileprefix + '_C'
    for i in Cs:
        pbar.update(1)
        if i == miC:
            continue  # already generated in step 1
        samplerGenerator(miN,miM,i,random_seed, fileprefix, fileposfix, cols_for_sampling, save_to, df, outformats)
def samplerGenerator(
        N=10,
        M=50,
        C=1,
        random_seed=1,
        fileprefix='sample',
        fileposfix='train',
        cols_for_sampling = ['space','time','day','rating','price','weather','root_type','type'],
        save_to=False,
        base_data=None,
        outformats=['csv']):
    '''
    Function to generate trajectories based on real data.

    Parameters:
    -----------
    N : int, optional
        Number of trajectories (default 10)
    M : int, optional
        Size of trajectories, number of points (default 50)
    C : int, optional
        Number of classes (default 1)
    random_seed : int, optional
        Random seed (default 1)
    cols_for_sampling : list, optional
        Columns to add in the generated dataset.
        Default: ['space', 'time', 'day', 'rating', 'price', 'weather', 'root_type', 'type'].
    save_to : str or bool, optional
        Destination folder to save, or False if not to save CSV files (default False)
    fileprefix : str, optional
        Output filename prefix (default 'sample')
    fileposfix : str, optional
        Output filename postfix (default 'train')
    base_data : DataFrame, optional
        DataFrame of trajectories to use as a base for sampling data.
        Default: None (uses example data)
    outformats : list, optional
        Output file formats for saving (default ['csv'])

    Returns:
    --------
    pandas.DataFrame
        The generated dataset.
    '''
    assert N > 0, 'N > 0'
    assert M > 0, 'M > 0'
    assert C > 0, 'C > 0'
    assert N >= C, 'N >= C'

    # Random Seed (pandas' .sample uses numpy's global RNG)
    np.random.seed(seed=random_seed)
    random.seed(random_seed)

    df, cols_for_sampling, desc_cols = getSamplingData(base_data, cols_for_sampling)
    df_for_sampling = df[cols_for_sampling]

    # Number of trajectories per class (integer division may leave a remainder)
    n = int(N / C)
    new_df = pd.concat( list(map(lambda j: sample_set(df_for_sampling, n, M, 'C'+str(j+1), j), range(C))) )

    # BUG FIX: when C does not divide N evenly, the original appended only ONE
    # extra trajectory (with tid N+1), so the result could still have fewer
    # than N trajectories and a gap in the tid sequence. Generate every missing
    # trajectory with sequential tids, assigned to the last class.
    for k in range(n * C, N):
        df_ = sample_trajectory(df_for_sampling, M, k)
        df_['label'] = 'C'+str(C)
        new_df = pd.concat([new_df, df_])

    # Order points by tid, day and time.
    # NOTE(review): requires 'day' and 'time' in cols_for_sampling — confirm for custom columns.
    new_df = new_df.sort_values(['tid','day','time'])
    new_df.reset_index(drop=True, inplace=True)

    # Output file:
    if save_to:
        if not os.path.exists(save_to):
            os.makedirs(save_to)
        filename = '_'.join([fileprefix,
                             str(N),'trajectories',
                             str(M),'points',
                             str(len(cols_for_sampling)),'attrs',
                             str(C),'labels',
                             fileposfix])
        for outType in outformats:
            writeFile(save_to, new_df, filename, 'tid', 'label', ['tid', 'label']+cols_for_sampling, None, desc_cols, outType)
    return new_df
def getSamplingData(base_data, cols_for_sampling):
    """
    Resolve the base DataFrame and sampling columns for the generators.

    Parameters:
    -----------
    base_data : DataFrame or None
        User-supplied trajectory data; None loads the bundled Foursquare sample.
    cols_for_sampling : list or dict
        Column names, or a {column: descriptor-type} dict (dict form is only
        interpreted when base_data is provided).

    Returns:
    --------
    tuple (df, cols_for_sampling, desc_cols)
        The data, the column-name list, and a {column: type} descriptor dict
        (None when base_data is given with a plain column list).
    """
    if base_data is None:
        # Default: bundled Foursquare sample shipped with the package.
        base_data = os.path.join(os.path.dirname(__file__), 'assets', 'sample', 'Foursquare_Sample.csv')
        df = pd.read_csv(base_data).dropna()
        df = df.rename(columns={"lat_lon": "space"})
        # Ordered categorical so sort_values orders weekdays naturally.
        cats = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
        df['day'] = pd.Categorical(df['day'], categories=cats, ordered=True)
        desc_cols = {'space':'space2d', 'time':'time', 'day':'nominal', 'rating':'numeric', 'price':'numeric',
                     'weather':'nominal', 'root_type':'nominal', 'type':'nominal'}
    else:
        df = base_data
        # Idiom fix: isinstance instead of `type(x) is dict`.
        if isinstance(cols_for_sampling, dict):
            desc_cols = cols_for_sampling.copy()
            cols_for_sampling = list(cols_for_sampling.keys())
        else:
            desc_cols = None
    return df, cols_for_sampling, desc_cols
def scalerRandomGenerator(
        Ns=[100, 10],
        Ms=[10, 10],
        Ls=[8, 10],
        Cs=[2, 10],
        random_seed=1,
        fileprefix='scalability',
        fileposfix='train',
        attr_desc=None,
        save_to=None,
        save_desc_files=True,
        outformats=['csv']):
    '''
    Function to generate trajectory datasets based on random data.

    For each dimension (attributes L, trajectories N, points M, classes C) a
    series of datasets is produced where that dimension follows a doubling
    scale while the other three are held at a fixed value.

    Parameters:
    -----------
    Ns : list of int, optional
        Parameters to scale the number of trajectories.
        List of 2 values: starting number, number of elements (default [100, 10])
    Ms : list of int, optional
        Parameters to scale the size of trajectories.
        List of 2 values: starting number, number of elements (default [10, 10])
    Ls : list of int, optional
        Parameters to scale the number of attributes (* doubles the columns).
        List of 2 values: starting number, number of elements (default [8, 10])
    Cs : list of int, optional
        Parameters to scale the number of classes.
        List of 2 values: starting number, number of elements (default [2, 10])
    random_seed : int, optional
        Random seed (default 1)
    attr_desc : list, optional
        Data type intervals to generate attributes as a list of descriptive dicts.
        Default: None (uses default types)
    save_to : str or bool, optional
        Destination folder to save, or False if not to save CSV files (default False)
    fileprefix : str, optional
        Output filename prefix (default 'scalability')
    fileposfix : str, optional
        Output filename postfix (default 'train')
    save_desc_files : bool, optional
        True if to save the .json description files, False otherwise (default True)
    outformats : list, optional
        Output file formats for saving (default ['csv'])

    Returns:
    --------
    None
    '''
    # Sanity-check the scale starting points.
    assert Ns[0] > 0, 'N > 0'
    assert Ms[0] > 0, 'M > 0'
    assert Cs[0] > 0, 'C > 0'
    assert Ls[0] > 0, 'L > 0'
    assert Ns[0] >= Cs[0], 'N >= C'
    assert save_to, 'save_to param must be set.'

    # Random Seed (both numpy and stdlib RNGs are used by the generators)
    np.random.seed(seed=random_seed)
    random.seed(random_seed)

    # One progress tick per generated dataset.
    pbar = tqdm(range(Ns[1] + Ms[1] + Ls[1] + Cs[1]))

    # Expand [start, count] into doubling scales, e.g. [100, 3] -> [100, 200, 400].
    # Note Ls itself is NOT rebound: Ls[0] is still the starting attribute count below.
    Ns = getScale(Ns[0], Ns[1])
    Ms = getScale(Ms[0], Ms[1])
    La = getScale(Ls[0], Ls[1])
    Cs = getScale(Cs[0], Cs[1])

    # Fixed (middle-of-scale) values used while one dimension varies.
    miN = getMiddleE(Ns)
    miM = getMiddleE(Ms)
    miL = Ls[0] #getMiddleE(La)
    miC = getMiddleE(Cs)

    print('N ::', 'fix. value:', '\t', miN, '\tscale:\t', Ns)
    print('M ::', 'fix. value:', '\t', miM, '\tscale:\t', Ms)
    print('L ::', 'fix. value:', '\t', miL, '\tscale:\t', La)
    print('C ::', 'fix. value:', '\t', miC, '\tscale:\t', Cs)

    # Default attribute descriptors, truncated to the starting attribute count.
    if not attr_desc:
        attr_desc = default_types()[:Ls[0]]
    generators = instantiate_generators(attr_desc)

    # 1 - Scale attributes (reshape columns), fixed trajectories, points, and classes:
    #prefix = fileprefix #+ '_L'
    for i in La:
        # Attributes are cycled from the generator pool to reach i columns.
        randomGenerator(miN, miM, i, miC, random_seed, fileprefix, fileposfix, cycleGenerators(i, generators), save_to, outformats)
        pbar.update(1)
        if save_to and save_desc_files:
            # NOTE(review): when i exceeds len(generators), cycled duplicates share
            # a name and collapse in this dict — verify .json output for that case.
            desc_cols = {g.name: g.descType() for g in cycleGenerators(i, generators)}
            featuresJSON(desc_cols, 1, file=os.path.join(save_to, '_'.join([fileprefix,str(i),'attrs'])+ ".json"))
            featuresJSON(desc_cols, 2, file=os.path.join(save_to, '_'.join([fileprefix,str(i),'attrs'])+ "_hp.json"))

    # 2 - Scale trajectories, fixed points, attributes, and classes
    #prefix = fileprefix + '_N'
    for i in Ns:
        pbar.update(1)
        if i == miN:
            continue  # the fixed-N dataset was already generated in step 1
        randomGenerator(i, miM, miL, miC, random_seed, fileprefix, fileposfix, generators, save_to, outformats)

    # 3 - Scale points, fixed trajectories, attributes, and classes
    #prefix = fileprefix + '_M'
    for i in Ms:
        pbar.update(1)
        if i == miM:
            continue  # already generated in step 1
        randomGenerator(miN, i, miL, miC, random_seed, fileprefix, fileposfix, generators, save_to, outformats)

    # 4 - Scale classes, fixed trajectories, points, and attributes
    #prefix = fileprefix + '_C'
    for i in Cs:
        pbar.update(1)
        if i == miC:
            continue  # already generated in step 1
        randomGenerator(miN, miM, miL, i, random_seed, fileprefix, fileposfix, generators, save_to, outformats)
def randomGenerator(
        N=10,
        M=50,
        L=10,
        C=10,
        random_seed=1,
        fileprefix='random',
        fileposfix='train',
        attr_desc=None,
        save_to=False,
        outformats=['csv']):
    '''
    Function to generate trajectories based on random data.

    Parameters:
    -----------
    N : int, optional
        Number of trajectories (default 10)
    M : int, optional
        Size of trajectories (default 50)
    L : int, optional
        Number of attributes (default 10)
    C : int, optional
        Number of classes (default 10)
    random_seed : int, optional
        Random Seed (default 1)
    attr_desc : list of dict, optional
        Data type intervals to generate attributes as a list of descriptive dicts,
        OR a list of instances of AttributeGenerator.
        Default: None (uses default types)
    save_to : str or bool, optional
        Destination folder to save, or False if not to save CSV files (default False)
    fileprefix : str, optional
        Output filename prefix (default 'random')
    fileposfix : str, optional
        Output filename postfix (default 'train')
    outformats : list, optional
        Output file formats for saving (default ['csv'])

    Returns:
    --------
    pandas.DataFrame
        The generated dataset.
    '''
    assert N > 0, 'N > 0'
    assert M > 0, 'M > 0'
    assert L > 0, 'L > 0'
    assert C > 0, 'C > 0'
    assert N >= C, 'N >= C'

    # Random Seed
    np.random.seed(seed=random_seed)
    random.seed(random_seed)

    # Accept either descriptor dicts or ready-made AttributeGenerator instances.
    if not attr_desc:
        attr_desc = default_types()
    if isinstance(attr_desc[0], AttributeGenerator):
        generators = attr_desc
    else:
        generators = instantiate_generators(attr_desc)

    # Number of trajectories per class (integer division may leave a remainder)
    n = int(N / C)
    new_df = pd.concat( list(map(lambda j: random_set(n, M, L, 'C'+str(j+1), j, generators), range(C))) )

    # BUG FIX: when C does not divide N evenly, the original appended only ONE
    # extra trajectory (with tid N+1), so the result could still have fewer
    # than N trajectories and a gap in the tid sequence. Generate every missing
    # trajectory with sequential tids, assigned to the last class.
    for k in range(n * C, N):
        df_ = random_trajectory(M, L, k, generators)
        df_['label'] = 'C'+str(C)
        new_df = pd.concat([new_df, df_])

    # Reset indexes (no chronological sort for random data)
    new_df.reset_index(drop=True, inplace=True)

    # Output file:
    if save_to:
        if not os.path.exists(save_to):
            os.makedirs(save_to)
        filename = '_'.join([fileprefix,
                             str(N),'trajectories',
                             str(M),'points',
                             str(L),'attrs',
                             str(C),'labels',
                             fileposfix])
        # NOTE(review): desc_cols keys are bare generator names while the data
        # columns carry the 'a<i>_' prefix — confirm writeFile matches them.
        desc_cols = {g.name: g.descType() for g in generators}
        for outType in outformats:
            writeFile(save_to, new_df, filename, 'tid', 'label', list(new_df.columns), None, desc_cols, outType)
    return new_df
# print('Not implemented.')
# --------------------------------------------------------------------------------
def default_types():
    """Return the default attribute descriptors used by the random generators.

    Each descriptor is a kwargs dict for AttributeGenerator: a name, an atype,
    a generation method, and either a value interval or a cardinality 'n'.
    """
    weekdays = ['Monday', 'Tuesday', 'Thursday', 'Friday', 'Saturday', 'Sunday', 'Wednesday']
    weather_conditions = ['Clear', 'Clouds', 'Fog', 'Unknown', 'Rain', 'Snow']
    poi_categories = ['Residence', 'Food', 'Travel & Transport', 'Professional & Other Places', 'Shop & Service',
                      'Outdoors & Recreation', 'College & University', 'Arts & Entertainment', 'Nightlife Spot', 'Event']
    return [
        {'name': 'space',    'atype': 'space',    'method': 'grid_cell', 'interval': [(0.0, 1000.0), (0.0, 1000.0)]},
        {'name': 'time',     'atype': 'time',     'method': 'random',    'interval': [0, 1440]},
        {'name': 'n1',       'atype': 'numeric',  'method': 'random',    'interval': [-1000, 1000]},
        {'name': 'n2',       'atype': 'numeric',  'method': 'random',    'interval': [0.0, 1000.0]},
        {'name': 'nominal',  'atype': 'nominal',  'method': 'random',    'n': 1000},
        {'name': 'day',      'atype': 'day',      'method': 'random',    'interval': weekdays},
        {'name': 'weather',  'atype': 'weather',  'method': 'random',    'interval': weather_conditions},
        #{'name': 'poi', 'atype': 'poi', 'method': 'serial', 'n': 100, 'dependency': 'space'},
        {'name': 'category', 'atype': 'category', 'method': 'random',    'dependency': 'space', 'interval': poi_categories},
    ]
def instantiate_generators(attr_desc=None):
    """Build one AttributeGenerator per descriptor dict.

    BUG FIX: the previous default `attr_desc=default_types()` was evaluated
    once at import time (shared-default anti-pattern); a None sentinel now
    builds fresh descriptors per call.

    Parameters:
    -----------
    attr_desc : list of dict, optional
        Descriptor dicts (kwargs for AttributeGenerator). Default: default_types().

    Returns:
    --------
    list of AttributeGenerator
    """
    if attr_desc is None:
        attr_desc = default_types()
    return [AttributeGenerator(**g) for g in attr_desc]
def getScale(start=100, n_ele=10):
    """Return a doubling scale: [start, 2*start, 4*start, ...] with n_ele elements.

    BUG FIX: the recursive original never terminated for n_ele <= 0 (n_ele-1
    stayed truthy), raising RecursionError, and consumed one stack frame per
    element. This iterative form returns [] for n_ele <= 0 and handles large
    n_ele safely; behavior for n_ele >= 1 is unchanged.
    """
    return [start * (2 ** i) for i in range(n_ele)]
def getMiddleE(X):
    """Return the middle element of sequence X (lower middle for even lengths)."""
    middle = (len(X) - 1) // 2
    return X[middle]
def sample_trajectory( df, M, tid ):
    """Draw M random points from df as one trajectory.

    The 0-based tid is stored 1-based in a leading 'tid' column.
    """
    trajectory = df.sample(M)
    trajectory.insert(0, 'tid', tid + 1)
    return trajectory
def sample_set(df_for_sampling, N, M, label, j):
    """Build N sampled trajectories of M points each for one class.

    Trajectory ids are offset by j*N so each class gets a disjoint tid range.
    """
    trajectories = [sample_trajectory(df_for_sampling, M, i + j * N) for i in range(N)]
    class_df = pd.concat(trajectories)
    class_df['label'] = label
    # Order points by tid, then weekday, then time.
    class_df = class_df.sort_values(['tid', 'day', 'time'])
    class_df.reset_index(drop=True, inplace=True)
    return class_df
def cycleGenerators(L, generators):
    """Return a list of L generators, repeating the pool cyclically as needed."""
    if not generators:
        return []
    pool_size = len(generators)
    return [generators[i % pool_size] for i in range(L)]
def random_trajectory(M, L, tid, generators):
    """Generate one random trajectory of M points with L attributes.

    Columns are named 'a<i>_<generator name>'; the 0-based tid is stored 1-based.
    """
    gens = cycleGenerators(L, generators)
    series = [pd.Series(gens[i].nextn(M), name='a' + str(i + 1) + '_' + gens[i].name)
              for i in range(L)]
    trajectory = pd.concat(series, axis=1)
    trajectory.insert(0, 'tid', tid + 1)
    return trajectory
def random_set(N, M, L, label, j, generators):
    """Generate N random trajectories (M points, L attributes) for one class.

    Trajectory ids are offset by j*N so each class gets a disjoint tid range.
    """
    trajectories = [random_trajectory(M, L, i + j * N, generators) for i in range(N)]
    class_df = pd.concat(trajectories)
    class_df['label'] = label
    class_df.reset_index(drop=True, inplace=True)
    return class_df
# --------------------------------------------------------------------------------
class AttributeGenerator:
    """
    Facade over the concrete value generators (nominal, numeric, spatial).

    Chooses the underlying generator from `atype` and delegates next()/nextn().
    """

    def __init__(self, name='attr', atype='nominal', method='random', interval=None, n=-1, dependency=None,
                 cellSize=1, adjacents=None, precision=2):
        self.name = name
        self.atype = atype
        self.method = method
        self.dependency = dependency
        nominal_like = ['nominal', 'day', 'weather', 'poi', 'category']
        if atype in nominal_like:
            self.generator = NominalGenerator(method, n, interval)
        elif atype in ['time', 'numeric']:
            # interval carries (start, end) bounds for numeric ranges
            self.generator = NumericGenerator(method, interval[0], interval[1], precision)
        elif atype in ['space']:
            # interval carries (X bounds, Y bounds) for the 2D grid
            self.generator = SpatialGrid2D(interval[0], interval[1], cellSize, adjacents, precision)
        # NOTE(review): an unknown atype leaves self.generator unset
        # (AttributeError on first use) — preserved from the original.

    def descType(self):
        """Map atype to the descriptor type used in the .json description files."""
        if self.atype in ['nominal', 'day', 'weather', 'poi', 'category']:
            return 'nominal'
        if self.atype in ['space']:
            return 'space2d'
        return self.atype

    def next(self):
        """Return one generated value."""
        return self.generator.next()

    def nextn(self, n):
        """Return a list of n generated values."""
        return self.generator.nextn(n)
class NominalGenerator:
    """Generates nominal (categorical) values, either random or sequential."""

    @staticmethod
    def nominalInterval(n):
        """Build n distinct labels: 'A'..'Z', then 'AA', 'AB', ... (Excel-column style)."""
        letters = [chr(ord('A') + y) for y in range(26)]
        labels = []
        width = 1
        while len(labels) < n:
            labels += [''.join(comb) for comb in itertools.product(letters, repeat=width)]
            width += 1
        return labels[:n]

    def __init__(self, method='random', n=50, interval=None):
        """
        method : 'random' (uniform choice), 'sequential' (cyclic), or 'serial' (unimplemented).
        n : number of auto-generated labels when interval is None.
        interval : explicit list of values to draw from.
        """
        self.method = method
        self.interval = self.nominalInterval(n) if interval is None else interval
        # BUG FIX: self.n previously kept the raw `n` argument even when an
        # explicit interval was supplied (AttributeGenerator passes n=-1 in that
        # case; the default is 50), which broke 'sequential' wrap-around with an
        # IndexError or a constant first element. Derive n from the actual
        # interval; unchanged when the interval is auto-generated (len == n).
        self.n = len(self.interval)
        self.pos = -1  # last index served by the 'sequential' method

    def next(self):
        """Return one value according to the configured method."""
        if self.method == 'random':
            return random.choice(self.interval)
        elif self.method == 'sequential':
            # advance cyclically through the interval
            self.pos = self.pos + 1 if self.pos < self.n - 1 else 0
            return self.interval[self.pos]
        elif self.method == 'serial':
            raise NotImplementedError('NominalGenerator.method==serial')

    def nextn(self, n):
        """Return a list of n values."""
        return list(map(lambda i: self.next(), range(n)))
class NumericGenerator:
    """
    Generates numeric values in [start, end].

    Integer bounds yield integers (randint); float bounds yield floats rounded
    to `precision` decimal places.
    """

    def __init__(self, method='random', start=0, end=100, precision=2):
        self.method = method
        self.start = start
        self.end = end
        self.last = start        # kept for interface compatibility; unused by 'random'
        self.precision = precision

    def next(self):
        """Return one value according to the configured method."""
        if self.method == 'serial':
            raise NotImplementedError('NumericGenerator.method==serial')
        if self.method == 'random':
            integral = isinstance(self.start, int)
            if integral:
                return random.randint(self.start, self.end)
            return round(random.uniform(self.start, self.end), self.precision)

    def nextn(self, n):
        """Return a list of n values."""
        return [self.next() for _ in range(n)]
class SpatialGrid2D:
    """
    2D spatial grid over [X0, X1] x [Y0, Y1] with square cells of side cellSize.

    Produces random points (as 'x y' strings via nextn) and supports cell
    positioning, adjacency lookup, and random routes over adjacent cells.
    """

    # Offsets of the 24 surrounding cells in a 5x5 neighborhood (radius 2).
    # BUG FIX: the original table repeated (-2, 2) five times (copy-paste),
    # omitting (-1, 2), (0, 2), (1, 2) and (2, 2).
    SPATIAL_ADJACENTS_2 = [
        ( -2, -2 ), ( -2, -1 ), ( -2, 0 ), ( -2, 1 ), ( -2, 2 ),
        ( -1, -2 ), ( -1, -1 ), ( -1, 0 ), ( -1, 1 ), ( -1, 2 ),
        (  0, -2 ), (  0, -1 ),            (  0, 1 ), (  0, 2 ),
        (  1, -2 ), (  1, -1 ), (  1, 0 ), (  1, 1 ), (  1, 2 ),
        (  2, -2 ), (  2, -1 ), (  2, 0 ), (  2, 1 ), (  2, 2 ),
    ]

    # Offsets of the 8 surrounding cells (radius 1). Default neighborhood.
    SPATIAL_ADJACENTS_1 = [
        ( -1, -1 ), ( -1, 0 ), ( -1, 1 ),
        (  0, -1 ),            (  0, 1 ),
        (  1, -1 ), (  1, 0 ), (  1, 1 ),
    ]

    def __init__(self, X=(1,5), Y=(1,5), cellSize=1, spatial_adjacents=None, precision=2, dependency=None):
        """
        X, Y : (min, max) bounds of the grid on each axis.
        cellSize : side length of each square cell.
        spatial_adjacents : custom offset table (default: radius-1 neighborhood).
        precision : decimal places for generated coordinates.
        dependency : unused; kept for interface compatibility.
            BUG FIX: mutable default [] replaced with None.
        """
        self.X = X
        self.Y = Y
        self.cellSize = cellSize
        self.precision = precision
        if not spatial_adjacents:
            self.SPATIAL_ADJACENTS = self.SPATIAL_ADJACENTS_1
        else:
            self.SPATIAL_ADJACENTS = spatial_adjacents

    def size(self):
        """Number of cells in the grid."""
        return int(((self.X[1] - self.X[0]) / self.cellSize) * ((self.Y[1] - self.Y[0]) / self.cellSize))

    def position(self, x, y):
        """Return the (cell_x, cell_y) containing point (x, y), or None if outside the grid."""
        if x < self.X[0] or x > self.X[1] or y < self.Y[0] or y > self.Y[1]:
            return None
        # Points exactly on the upper edge are nudged into the last cell.
        if x == self.X[1]:
            x = x - (self.cellSize/2)
        if y == self.Y[1]:
            y = y - (self.cellSize/2)
        return ( int(math.floor(x / self.cellSize)) , int(math.floor(y / self.cellSize)) )

    def adjacents(self, cell):
        """Return the neighboring cells of `cell` that stay inside the grid bounds."""
        candidates = map(lambda ajc: (cell[0]+(ajc[0]*self.cellSize), cell[1]+(ajc[1]*self.cellSize)),
                         self.SPATIAL_ADJACENTS)
        return list(filter(lambda ajc: ajc[0] >= self.X[0] and ajc[0] <= (self.X[1]-self.cellSize) and
                                       ajc[1] >= self.Y[0] and ajc[1] <= (self.Y[1]-self.cellSize),
                           candidates))

    def randomRoute(self, startCell, n):
        """Random walk of n cells starting at startCell; each step moves to an adjacent in-grid cell."""
        route = [startCell]
        for i in range(n-1):
            route.append( random.choice(self.adjacents(route[-1])) )
        return route

    def next(self):
        """Random point in the grid area.

        NOTE(review): the upper bound is X[1]+cellSize (and Y likewise), so a
        point can fall past the nominal grid edge — preserved from the original.
        """
        return round(random.uniform(self.X[0], self.X[1]+self.cellSize), self.precision), \
               round(random.uniform(self.Y[0], self.Y[1]+self.cellSize), self.precision)

    def nextin(self, cell):
        """Random point inside the given cell."""
        return round(random.uniform(cell[0], cell[0]+self.cellSize), self.precision), \
               round(random.uniform(cell[1], cell[1]+self.cellSize), self.precision)

    def nextn(self, n):
        """Return n random points formatted as 'x y' strings."""
        return list(map(lambda i: self.text(self.next()), range(n)))

    def text(self, point):
        """Format a point tuple as 'x y'."""
        return str(point[0]) + ' ' + str(point[1])
# --------------------------------------------------------------------------------