from __future__ import division
from lcc.utils.data_analysis import to_PAA, normalize
import numpy as np
class SAX(object):
    """
    Symbolic representation of data series via the Symbolic Aggregate
    approXimation (SAX) method.

    A data series is reduced to a short word of lowercase letters; two
    such words of equal length can then be compared with a symbolic
    distance (see `compare_strings`).

    Attributes
    -----------
    word_size : int
        Number of letters in transformed word

    alphabet_size : int
        Size of alphabet counted from 'a' (3 means a, b, c)

    scaling_factor : int, float
        Multiplier applied to the word distance in `compare_strings`.
        NOTE: `to_letter_rep` overwrites it with
        ``sqrt(len(x) / word_size)`` on every call.

    beta : list
        Ascending breakpoints for the given alphabet size
        (``alphabet_size - 1`` values)

    compare_dict : dict
        Lookup table mapping a two-letter string to the distance between
        these two letters (built by `build_letter_compare_dict`)
    """

    MIN_ALPH_SIZE = 3
    MAX_ALPH_SIZE = 20
    A_OFFSET = ord('a')

    def __init__(self, word_size=8, alphabet_size=10, scaling_factor=1):
        """
        Parameters
        -----------
        word_size : int
            Number of letters in transformed word

        alphabet_size : int
            Size of alphabet counted from 'a' (3 means a, b, c)

        scaling_factor : int, float
            Scaling factor can be used to scale result dissimilarity of
            two words created from light curves of different lengths

        Raises
        ------
        DictionarySizeIsNotSupported
            If `alphabet_size` is outside
            ``[MIN_ALPH_SIZE, MAX_ALPH_SIZE]``
        """
        if (alphabet_size < self.MIN_ALPH_SIZE or
                alphabet_size > self.MAX_ALPH_SIZE):
            raise DictionarySizeIsNotSupported("%i " % alphabet_size)
        self.word_size = word_size
        self.alphabet_size = alphabet_size
        # The breakpoint table is keyed by the alphabet size as a string.
        self.beta = self._getBreakpoints()[str(int(self.alphabet_size))]
        self.build_letter_compare_dict()
        self.scaling_factor = scaling_factor

    def to_letter_rep(self, x):
        """
        Transform a series of data into its SAX word.

        The series is passed through `normalize` and `to_PAA` (project
        helpers; presumably z-normalisation and Piecewise Aggregate
        Approximation -- confirm in `lcc.utils.data_analysis`) and the
        reduced values are then mapped to letters.

        As a side effect `scaling_factor` is set to
        ``sqrt(len(x) / word_size)``, replacing any previous value.

        Parameters
        ----------
        x : list, iterable
            Data series

        Returns
        -------
        str
            SAX word

        list
            Indices (second value returned by `to_PAA`)
        """
        paaX, indices = to_PAA(normalize(x), self.word_size)
        self.scaling_factor = np.sqrt(len(x) / self.word_size)
        return self.alphabetize(paaX), indices

    def alphabetize(self, paaX):
        """
        Convert the Piecewise Aggregate Approximation of a series
        to a series of letters.

        A value gets the letter whose index equals the number of
        breakpoints that are lower than or equal to it, so values below
        the first breakpoint become 'a' and values at or above the last
        one become the last letter of the alphabet.

        Parameters
        ---------
        paaX : list, iterable
            Data series (list of numbers)

        Returns
        -------
        str
            SAX word
        """
        # side='right' yields the first index j with value < beta[j],
        # i.e. exactly what a linear scan over the breakpoints finds.
        letter_indices = np.searchsorted(self.beta, paaX, side='right')
        return ''.join(chr(self.A_OFFSET + int(idx))
                       for idx in letter_indices)

    def compare_strings(self, sA, sB):
        """
        Compare two words based on individual letter distances.

        The result is ``scaling_factor * sqrt(sum(d_i ** 2))`` where
        ``d_i`` is the distance of the i-th letters of both words.

        Parameters
        ----------
        sA : str
            Word to compare

        sB : str
            Word to compare

        Returns
        -------
        float
            Dissimilarity of two words

        Raises
        ------
        ValueError
            If the words have different lengths
        """
        if len(sA) != len(sB):
            raise ValueError("StringsAreDifferentLength")
        squared_sum = sum(
            (self.compare_letters(la, lb) ** 2 for la, lb in zip(sA, sB)),
            0.0)
        return self.scaling_factor * np.sqrt(squared_sum)

    def compare_letters(self, la, lb):
        """
        Return the distance between two letters.

        Parameters
        ---------
        la : str
            First letter

        lb : str
            Second letter

        Returns
        -------
        float
            Distance between two letters

        Raises
        ------
        KeyError
            If the pair is not present in the lookup table (e.g. a letter
            outside of the current alphabet)
        """
        return self.compare_dict[la + lb]

    def build_letter_compare_dict(self):
        """
        Build the lookup table (`compare_dict`) of numeric distances
        between every pair of letters of the current alphabet.

        Identical and neighbouring letters have distance 0; otherwise the
        distance is the gap between the breakpoint below the higher
        letter and the breakpoint above the lower letter.

        Returns
        -------
        None
        """
        letters = [chr(self.A_OFFSET + k)
                   for k in range(int(self.alphabet_size))]
        self.compare_dict = {}
        for i, first in enumerate(letters):
            for j, second in enumerate(letters):
                if abs(i - j) <= 1:
                    distance = 0
                else:
                    distance = (self.beta[max(i, j) - 1] -
                                self.beta[min(i, j)])
                self.compare_dict[first + second] = distance

    def _sliding_window(self, x, window_size, overlapping_fraction=None):
        """
        Transform consecutive (overlapping) windows of a series into
        SAX words.

        The window start advances by
        ``int(window_size - window_size * overlapping_fraction)`` items.
        The given window size is also stored in `self.windowSize`.

        Parameters
        ----------
        x : list, iterable
            Data series (has to support `len` and slicing)

        window_size : int
            Number of items in one window

        overlapping_fraction : float, optional
            Fraction of the window shared with the following window.
            Defaults to 0.01 when None; an explicit 0 means no overlap.

        Returns
        -------
        list
            SAX words of particular windows

        list
            (start, end) index pairs of particular windows

        Raises
        ------
        OverlapSpecifiedIsNotSmallerThanWindowSize
            If the resulting step would be smaller than one item
        """
        self.windowSize = window_size
        if overlapping_fraction is None:
            overlapping_fraction = 0.01
        overlap = self.windowSize * overlapping_fraction
        # The step derived from the requested overlap is what is used;
        # it is no longer replaced by a hard-coded value.
        move_size = int(self.windowSize - overlap)
        if move_size < 1:
            raise OverlapSpecifiedIsNotSmallerThanWindowSize(
                "Window step %i is smaller than 1" % move_size)

        window_indices = []
        string_rep = []
        last_start = len(x) - self.windowSize
        ptr = 0
        while ptr <= last_start:
            window_end = ptr + self.windowSize
            word, _ = self.to_letter_rep(x[ptr:window_end])
            string_rep.append(word)
            window_indices.append((ptr, window_end))
            ptr += move_size
        return string_rep, window_indices

    def _getBreakpoints(self):
        """
        Return a fresh table of breakpoints for all supported alphabets.

        The values correspond to the two-decimal Gaussian breakpoint
        table commonly tabulated for SAX.

        Returns
        -------
        dict
            Alphabet size (as str) -> ascending list of
            ``alphabet_size - 1`` breakpoints
        """
        return {'3': [-0.43, 0.43],
                '4': [-0.67, 0, 0.67],
                '5': [-0.84, -0.25, 0.25, 0.84],
                '6': [-0.97, -0.43, 0, 0.43, 0.97],
                '7': [-1.07, -0.57, -0.18, 0.18, 0.57, 1.07],
                '8': [-1.15, -0.67, -0.32, 0, 0.32, 0.67, 1.15],
                '9': [-1.22, -0.76, -0.43, -0.14, 0.14, 0.43, 0.76, 1.22],
                '10': [-1.28, -0.84, -0.52, -0.25, 0, 0.25, 0.52, 0.84,
                       1.28],
                '11': [-1.34, -0.91, -0.6, -0.35, -0.11, 0.11, 0.35, 0.6,
                       0.91, 1.34],
                '12': [-1.38, -0.97, -0.67, -0.43, -0.21, 0, 0.21, 0.43,
                       0.67, 0.97, 1.38],
                '13': [-1.43, -1.02, -0.74, -0.5, -0.29, -0.1, 0.1, 0.29,
                       0.5, 0.74, 1.02, 1.43],
                '14': [-1.47, -1.07, -0.79, -0.57, -0.37, -0.18, 0, 0.18,
                       0.37, 0.57, 0.79, 1.07, 1.47],
                '15': [-1.5, -1.11, -0.84, -0.62, -0.43, -0.25, -0.08, 0.08,
                       0.25, 0.43, 0.62, 0.84, 1.11, 1.5],
                '16': [-1.53, -1.15, -0.89, -0.67, -0.49, -0.32, -0.16, 0,
                       0.16, 0.32, 0.49, 0.67, 0.89, 1.15, 1.53],
                '17': [-1.56, -1.19, -0.93, -0.72, -0.54, -0.38, -0.22,
                       -0.07, 0.07, 0.22, 0.38, 0.54, 0.72, 0.93, 1.19,
                       1.56],
                '18': [-1.59, -1.22, -0.97, -0.76, -0.59, -0.43, -0.28,
                       -0.14, 0, 0.14, 0.28, 0.43, 0.59, 0.76, 0.97, 1.22,
                       1.59],
                '19': [-1.62, -1.25, -1, -0.8, -0.63, -0.48, -0.34, -0.2,
                       -0.07, 0.07, 0.2, 0.34, 0.48, 0.63, 0.8, 1, 1.25,
                       1.62],
                '20': [-1.64, -1.28, -1.04, -0.84, -0.67, -0.52, -0.39,
                       -0.25, -0.13, 0, 0.13, 0.25, 0.39, 0.52, 0.67, 0.84,
                       1.04, 1.28, 1.64]
                }
class DictionarySizeIsNotSupported(ValueError):
    """Raised when the requested alphabet size is outside the supported range."""
class OverlapSpecifiedIsNotSmallerThanWindowSize(ValueError):
    """Raised when the window overlap leaves a sliding step smaller than one item."""