From 002a75093c61b5c5966ea05820ad72ef3baed5ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Patrick=20Sch=C3=A4fer?= Date: Mon, 26 May 2025 14:07:43 +0200 Subject: [PATCH 01/13] add dynamic alphabet allocation --- aeon/distances/tests/test_symbolic_mindist.py | 53 +++++ .../collection/dictionary_based/_sfa_fast.py | 181 +++++++++++++----- .../collection/dictionary_based/_sfa_whole.py | 25 ++- 3 files changed, 211 insertions(+), 48 deletions(-) diff --git a/aeon/distances/tests/test_symbolic_mindist.py b/aeon/distances/tests/test_symbolic_mindist.py index 2d81e04ddb..35ab500fde 100644 --- a/aeon/distances/tests/test_symbolic_mindist.py +++ b/aeon/distances/tests/test_symbolic_mindist.py @@ -170,3 +170,56 @@ def test_sfa_whole_mindist(): assert mindist_sfa <= ed assert mindist_dft_sfa >= mindist_sfa # a tighter lower bound assert mindist_dft_sfa <= ed + + +def test_dynamic_alphabet_allocation(): + """Test the SFA Min-Distance function.""" + n_segments = 16 + alphabet_size = 256 + + X_train, _ = load_unit_test("TRAIN") + X_test, _ = load_unit_test("TEST") + + X_train = zscore(X_train.squeeze(), axis=1) + X_test = zscore(X_test.squeeze(), axis=1) + histogram_type = "equi-width" + + # print("Testing") + for alphabet_allocation_method in { + "linear_scale", + "log_scale", + "sqrt_scale", + }: + sfa = SFAWhole( + word_length=n_segments, + alphabet_size=alphabet_size, + binning_method=histogram_type, + learn_alphabet_sizes=True, + alphabet_allocation_method=alphabet_allocation_method, + variance=True, # True gives a tighter lower bound + norm=True, + ) + + X_train_words, X_train_dfts = sfa.fit_transform(X_train) + X_test_words, _ = sfa.transform(X_test) + + for i in range(min(X_train.shape[0], X_test.shape[0])): + X = X_train[i].reshape(1, -1) + Y = X_test[i].reshape(1, -1) + + # SFA Min-Distance + mindist_sfa = mindist_sfa_distance( + X_train_words[i], X_test_words[i], sfa.breakpoints + ) + + # DFT-SFA Min-Distance + mindist_dft_sfa = mindist_dft_sfa_distance( + X_train_dfts[i], X_test_words[i], sfa.breakpoints + ) + + # Euclidean Distance + ed = np.linalg.norm(X[0] - Y[0]) + + assert mindist_sfa <= ed + assert mindist_dft_sfa >= mindist_sfa # a tighter lower bound + assert mindist_dft_sfa <= ed diff --git a/aeon/transformations/collection/dictionary_based/_sfa_fast.py b/aeon/transformations/collection/dictionary_based/_sfa_fast.py index 0244d73572..d281449ae4 100644 --- a/aeon/transformations/collection/dictionary_based/_sfa_fast.py +++ b/aeon/transformations/collection/dictionary_based/_sfa_fast.py @@ -41,6 +41,12 @@ "quantile", } +alphabet_allocation_methods = { + "linear_scale", + "log_scale", + "sqrt_scale", +} + simplefilter(action="ignore", category=NumbaPendingDeprecationWarning) simplefilter(action="ignore", category=NumbaTypeSafetyWarning) @@ -65,6 +71,12 @@ class SFAFast(BaseCollectionTransformer): Number of values to discretise each value to. window_size : int, default = 12 Size of window for sliding. Input series length for whole series transform. + learn_alphabet_sizes : boolean, default = False + If True, dynamic alphabet sizes are learned based on the variance of the Fourier + coefficients. + alphabet_allocation_method : str, default = None + The method used to learn the dynamic alphabet sizes. One of + {"linear_scale", "log_scale", "sqrt_scale"}. norm : boolean, default = False Mean normalise words by dropping first fourier coefficient. 
binning_method : str, default="equi-depth" @@ -143,6 +155,8 @@ def __init__( word_length=8, alphabet_size=4, window_size=12, + learn_alphabet_sizes=False, + alphabet_allocation_method=None, norm=False, binning_method="equi-depth", anova=False, @@ -170,6 +184,7 @@ def __init__( self.word_length = word_length self.alphabet_size = alphabet_size + self.alphabet_sizes = None self.window_size = window_size self.norm = norm @@ -192,11 +207,13 @@ def __init__( self.n_cases = 0 self.n_timepoints = 0 - self.letter_bits = 0 + self.letter_bits = None self.dilation = dilation self.first_difference = first_difference self.sampling_factor = sampling_factor + self.learn_alphabet_sizes = learn_alphabet_sizes + self.alphabet_allocation_method = alphabet_allocation_method # Feature selection part self.feature_selection = feature_selection @@ -230,6 +247,14 @@ def _fit_transform(self, X, y=None, return_bag_of_words=True): "Please set either variance or anova Fourier coefficient selection" ) + if self.learn_alphabet_sizes and ( + self.alphabet_allocation_method not in alphabet_allocation_methods + ): + raise ValueError( + "alphabet_allocation_method must be one of: ", + alphabet_allocation_methods, + ) + if self.binning_method not in binning_methods: raise TypeError("binning_method must be one of: ", binning_methods) @@ -259,13 +284,16 @@ def _fit_transform(self, X, y=None, return_bag_of_words=True): if (self.anova or self.variance) is True else self.word_length_actual ) + # make dft_length an even number (same number of reals and imags) self.dft_length = self.dft_length + self.dft_length % 2 self.word_length_actual = self.word_length_actual + self.word_length_actual % 2 self.support = np.arange(self.word_length_actual) - self.letter_bits = np.uint32(math.ceil(math.log2(self.alphabet_size))) - # self.word_bits = self.word_length_actual * self.letter_bits + + self.letter_bits = np.zeros(self.word_length_actual, dtype=np.uint32) + self.letter_bits[:] = np.uint32(math.ceil(math.log2(self.alphabet_size))) + X = X.squeeze(1) # subsample the samples @@ -538,10 +566,10 @@ def _binning(self, X, y=None): if self.variance: # determine variance - dft_variance = np.var(dft, axis=0) + self.dft_variance = np.var(dft, axis=0) # select word-length-many indices with the largest variance - self.support = np.argsort(-dft_variance)[: self.word_length_actual] + self.support = np.argsort(-self.dft_variance)[: self.word_length_actual] # sort remaining indices self.support = np.sort(self.support) @@ -569,6 +597,41 @@ def _binning(self, X, y=None): self.dft_length = np.max(self.support) + 1 self.dft_length = self.dft_length + self.dft_length % 2 # even + # learn alphabet sizes + if self.learn_alphabet_sizes: + if self.dft_variance is None: + self.dft_variance = np.var(dft, axis=0) + + symbols = np.log2(self.alphabet_size) + self.bit_budget = int(symbols * self.word_length) + + if self.alphabet_allocation_method == "linear_scale": + variance = np.sqrt(self.dft_variance[self.support]) + normed_scale = variance / variance.sum() + + elif self.alphabet_allocation_method == "sqrt_scale": + variance = np.sqrt(np.sqrt(self.dft_variance[self.support])) + normed_scale = variance / variance.sum() + + elif self.alphabet_allocation_method == "log_scale": + variance = np.log2(np.sqrt(self.dft_variance[self.support]) + 1) + normed_scale = variance / variance.sum() + + bit_arr_raw = np.floor(normed_scale * symbols * self.word_length).astype( + np.uint32 + ) + + # Use at most symbols+1 for each position + bit_arr = heal_array(bit_arr_raw, symbols + 1, 
self.bit_budget) + + self.alphabet_sizes = [int(2 ** bit_arr[i]) for i in range(len(bit_arr))] + self.letter_bits = np.array(bit_arr, dtype=np.uint32) + else: + # use the same alphabet size for all positions + self.alphabet_sizes = [self.alphabet_size for _ in range(self.word_length)] + + self.alphabet_sizes = np.array(self.alphabet_sizes) + if self.binning_method == "information-gain": return self._igb(dft, y) if self.binning_method == "information-gain-mae": @@ -597,30 +660,9 @@ def _k_bins_discretizer(self, dft): return breakpoints def _mcb(self, dft): - breakpoints = np.zeros((self.word_length_actual, self.alphabet_size)) - - dft = np.round(dft, 2) - for letter in range(self.word_length_actual): - column = np.sort(dft[:, letter]) - bin_index = 0 - - # use equi-depth binning - if self.binning_method == "equi-depth": - target_bin_depth = len(dft) / self.alphabet_size - - for bp in range(self.alphabet_size - 1): - bin_index += target_bin_depth - breakpoints[letter, bp] = column[int(bin_index)] - - # use equi-width binning aka equi-frequency binning - elif self.binning_method == "equi-width": - target_bin_width = (column[-1] - column[0]) / self.alphabet_size - - for bp in range(self.alphabet_size - 1): - breakpoints[letter, bp] = (bp + 1) * target_bin_width + column[0] - - breakpoints[:, self.alphabet_size - 1] = sys.float_info.max - return breakpoints + return mcb( + dft, self.alphabet_sizes, self.word_length_actual, self.binning_method + ) def _igb(self, dft, y): breakpoints = np.zeros((self.word_length_actual, self.alphabet_size)) @@ -695,7 +737,7 @@ def get_words(self): """ words = np.squeeze(self.words) return np.array( - [_get_chars(word, self.word_length, self.alphabet_size) for word in words] + [_get_chars(word, self.word_length, self.letter_bits) for word in words] ) def transform_words(self, X): @@ -723,7 +765,6 @@ def transform_words(self, X): self.inverse_sqrt_win_size, self.lower_bounding or self.lower_bounding_distances, self.word_length, - self.alphabet_size, self.breakpoints, ) @@ -775,17 +816,16 @@ def __setstate__(self, state): @njit(cache=True, fastmath=True) -def _get_chars(word, word_length, alphabet_size): +def _get_chars(word, word_length, letter_bits): chars = np.zeros(word_length, dtype=np.uint32) - letter_bits = int(np.log2(alphabet_size)) - mask = (1 << letter_bits) - 1 for i in range(word_length): # Extract the last bits + mask = (1 << letter_bits[i]) - 1 char = word & mask chars[-i - 1] = char # Right shift by to move to the next group of bits - word >>= letter_bits + word >>= letter_bits[i] return chars @@ -986,9 +1026,7 @@ def generate_words( needed_size += max(0, 2 * dfts.shape[1] - 5 * window_size) words = np.zeros((dfts.shape[0], needed_size), dtype=np.uint32) - - letter_bits = np.uint32(letter_bits) - word_bits = word_length * letter_bits # dfts.shape[2] * letter_bits + word_bits = np.uint32(np.sum(letter_bits)) # special case: binary breakpoints if breakpoints.shape[1] == 2: @@ -1005,7 +1043,7 @@ def generate_words( for a in prange(dfts.shape[0]): for i in range(word_length): # range(dfts.shape[2]): words[a, : dfts.shape[1]] = ( - words[a, : dfts.shape[1]] << letter_bits + words[a, : dfts.shape[1]] << letter_bits[a] ) | np.digitize(dfts[a, :, i], breakpoints[i], right=True) # add bigrams @@ -1091,7 +1129,6 @@ def _mft( ) transformed2 = transformed2 * inverse_sqrt_win_size - if lower_bounding: transformed2[:, :, 1::2] = transformed2[:, :, 1::2] * -1 @@ -1259,12 +1296,45 @@ def create_dict(feature_names, features_idx): return relevant_features 
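The `mcb` helper added below replaces the old `SFAFast._mcb` method: every Fourier coefficient (letter) can now have its own alphabet size, and breakpoint slots beyond a letter's alphabet are padded with the float maximum so no value is ever mapped past them. A minimal NumPy-only sketch of that idea, kept separate from the Numba implementation in this patch (the name `mcb_sketch` and the toy data are illustrative assumptions, not part of the change):

import numpy as np

def mcb_sketch(dft, alphabet_sizes, binning_method="equi-depth"):
    # dft: (n_cases, word_length) Fourier coefficients
    # alphabet_sizes: per-letter alphabet size, e.g. from a bit-budget allocation
    word_length = dft.shape[1]
    max_alphabet_size = int(np.max(alphabet_sizes))
    # slots past a letter's own alphabet keep the float32 maximum,
    # so no value can be assigned a symbol outside that letter's alphabet
    breakpoints = np.full(
        (word_length, max_alphabet_size), np.finfo(np.float32).max, dtype=np.float32
    )
    for letter in range(word_length):
        a = int(alphabet_sizes[letter])
        column = np.sort(dft[:, letter])
        if binning_method == "equi-depth":  # equal number of samples per bin
            depth = len(column) / a
            for bp in range(a - 1):
                breakpoints[letter, bp] = column[int(depth * (bp + 1))]
        elif binning_method == "equi-width":  # equal-width bins over the value range
            width = (column[-1] - column[0]) / a
            for bp in range(a - 1):
                breakpoints[letter, bp] = column[0] + (bp + 1) * width
    return breakpoints

# toy usage: letters with larger alphabets receive more breakpoints
rng = np.random.default_rng(0)
toy_dft = rng.normal(size=(50, 4)) * np.array([4.0, 2.0, 1.0, 0.5])
print(mcb_sketch(toy_dft, alphabet_sizes=np.array([8, 4, 2, 2])))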
+@njit(fastmath=True, cache=True, parallel=True) +def mcb(dft, alphabet_sizes, word_length_actual, binning_method): + max_alphabet_size = np.max(alphabet_sizes) + + breakpoints = np.zeros((word_length_actual, max_alphabet_size), dtype=np.float32) + breakpoints[:, :] = np.finfo(np.float32).max + dft = np.round(dft, 2) + + for letter in prange(word_length_actual): + curr_alphabet_size = alphabet_sizes[letter] + column = np.sort(dft[:, letter]) + bin_index = 0 + + # use equi-depth binning + if binning_method == "equi-depth": + target_bin_depth = len(dft) / curr_alphabet_size + + for bp in range(curr_alphabet_size - 1): + bin_index += target_bin_depth + breakpoints[letter, bp] = column[int(bin_index)] + + # use equi-width binning aka equi-frequency binning + elif binning_method == "equi-width": + target_bin_width = (column[-1] - column[0]) / curr_alphabet_size + + for bp in range(curr_alphabet_size - 1): + breakpoints[letter, bp] = (bp + 1) * target_bin_width + column[0] + + return breakpoints + + @njit(fastmath=True, cache=True) def shorten_words(words, amount, letter_bits): new_words = np.zeros((words.shape[0], words.shape[1]), dtype=np.uint32) # Unigrams - shift_len = amount * letter_bits + shift_len = np.sum( + letter_bits[:amount] + ) # this does not work for variable-alphabet size for j in prange(words.shape[1]): # shorten a word by set amount of letters new_words[:, j] = words[:, j] >> shift_len @@ -1291,7 +1361,6 @@ def _transform_words_case( inverse_sqrt_win_size, lower_bounding, word_length, - alphabet_size, breakpoints, ): dfts = _mft( @@ -1311,9 +1380,35 @@ def _transform_words_case( for x in prange(dfts.shape[0]): for window in prange(dfts.shape[1]): for i in prange(word_length): - for bp in range(alphabet_size): + for bp in range(breakpoints.shape[1]): if dfts[x, window, i] <= breakpoints[i][bp]: words[x, window, i] = bp break return words, dfts + + +@njit(fastmath=True, cache=True) +def heal_array(bit_array, max_val, budget): + bit_array = bit_array.copy() + + # cap values beyond max_val + for i in range(len(bit_array)): + bit_array[i] = min(max_val, bit_array[i]) + + if bit_array.sum() > budget: + # print("Error", bit_array, bit_array.sum(), budget, bit_array.sum() <= budget) + assert bit_array.sum() <= budget + + # heal the array to have the correct sum == budget + if bit_array.sum() != budget: + diff = budget - bit_array.sum() + while diff > 0: + for i in range(len(bit_array)): + if bit_array[i] < max_val: + bit_array[i] += 1 + diff -= 1 + if diff == 0: + break + + return bit_array diff --git a/aeon/transformations/collection/dictionary_based/_sfa_whole.py b/aeon/transformations/collection/dictionary_based/_sfa_whole.py index 2e9f3df86a..083701dddb 100644 --- a/aeon/transformations/collection/dictionary_based/_sfa_whole.py +++ b/aeon/transformations/collection/dictionary_based/_sfa_whole.py @@ -14,12 +14,14 @@ class SFAWhole(SFAFast): """Symbolic Fourier Approximation (SFA) Transformer. A whole series transform for SFA which holds the lower bounding lemma, see [1]. + SOFA [2] extends on SFA by introducing variance based feature selection, and dynamic + alphabet sizes. It produces a significantly tighter lower bound. It is implemented as a wrapper for the SFA-Fast transformer, the latter implements subsequence-based SFA extraction. This wrapper reduces non-needed parameters, and sets some usefull defaults for - lower bounding. + the tightest possible lower bounding. 
Parameters ---------- @@ -27,9 +29,15 @@ class SFAWhole(SFAFast): Length of word to shorten window to (using DFT). alphabet_size : int, default = 4 Number of values to discretise each value to. + learn_alphabet_sizes : boolean, default = True + If True, dynamic alphabet sizes are learned based on the variance of the Fourier + coefficients. + alphabet_allocation_method : str, default = linear_scale + The method used to learn the dynamic alphabet sizes. One of + {"linear_scale", "log_scale", "sqrt_scale"}. norm : boolean, default = False Mean normalise words by dropping first fourier coefficient. - binning_method : str, default="equi-depth" + binning_method : str, default="equi-width" The binning method used to derive the breakpoints. One of {"equi-depth", "equi-width", "information-gain", "information-gain-mae", "kmeans", "quantile"}, variance : boolean, default = False @@ -51,9 +59,12 @@ class SFAWhole(SFAFast): References ---------- - .. [1] Schäfer, Patrick, and Mikael Högqvist. "SFA: a symbolic fourier approximation + .. [1] P. Schäfer, and M. Högqvist. "SFA: a symbolic fourier approximation and index for similarity search in high dimensional datasets." Proceedings of the 15th international conference on extending database technology. 2012. + .. [2] P. Schäfer, J. Brand, U. Leser, B. Peng and T. Palpanas, "Fast and Exact + Similarity Search in Less than a Blink of an Eye," in 2025 IEEE 41st International + Conference on Data Engineering (ICDE), Hong Kong, 2025, pp. 2464-2477. """ _tags = { @@ -66,8 +77,10 @@ def __init__( self, word_length=8, alphabet_size=4, + learn_alphabet_sizes=True, + alphabet_allocation_method="linear_scale", norm=True, - binning_method="equi-depth", + binning_method="equi-width", variance=True, sampling_factor=None, random_state=None, @@ -77,6 +90,8 @@ def __init__( word_length=word_length, alphabet_size=alphabet_size, norm=norm, + learn_alphabet_sizes=learn_alphabet_sizes, + alphabet_allocation_method=alphabet_allocation_method, binning_method=binning_method, variance=variance, sampling_factor=sampling_factor, @@ -92,7 +107,7 @@ def __init__( skip_grams=False, remove_repeat_words=False, return_sparse=False, - window_size=None, # set in fit + window_size=None, # set in fit - do not remove ) def _fit_transform(self, X, y=None): From 21dffb752c0a4e72b25ee537e9649b6b338633c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Patrick=20Sch=C3=A4fer?= Date: Mon, 26 May 2025 14:18:45 +0200 Subject: [PATCH 02/13] update default --- .../transformations/collection/dictionary_based/_sfa_whole.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aeon/transformations/collection/dictionary_based/_sfa_whole.py b/aeon/transformations/collection/dictionary_based/_sfa_whole.py index 083701dddb..51259cefef 100644 --- a/aeon/transformations/collection/dictionary_based/_sfa_whole.py +++ b/aeon/transformations/collection/dictionary_based/_sfa_whole.py @@ -32,7 +32,7 @@ class SFAWhole(SFAFast): learn_alphabet_sizes : boolean, default = True If True, dynamic alphabet sizes are learned based on the variance of the Fourier coefficients. - alphabet_allocation_method : str, default = linear_scale + alphabet_allocation_method : str, default = sqrt_scale The method used to learn the dynamic alphabet sizes. One of {"linear_scale", "log_scale", "sqrt_scale"}. 
norm : boolean, default = False @@ -78,7 +78,7 @@ def __init__( word_length=8, alphabet_size=4, learn_alphabet_sizes=True, - alphabet_allocation_method="linear_scale", + alphabet_allocation_method="sqrt_scale", norm=True, binning_method="equi-width", variance=True, From 57c5b77e1dfc49fc1426125d06060f04fac3deb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Patrick=20Sch=C3=A4fer?= Date: Mon, 26 May 2025 14:36:36 +0200 Subject: [PATCH 03/13] fix test for availability of variable --- aeon/transformations/collection/dictionary_based/_sfa_fast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aeon/transformations/collection/dictionary_based/_sfa_fast.py b/aeon/transformations/collection/dictionary_based/_sfa_fast.py index d281449ae4..d75f575e7c 100644 --- a/aeon/transformations/collection/dictionary_based/_sfa_fast.py +++ b/aeon/transformations/collection/dictionary_based/_sfa_fast.py @@ -599,7 +599,7 @@ def _binning(self, X, y=None): # learn alphabet sizes if self.learn_alphabet_sizes: - if self.dft_variance is None: + if not hasattr(self, "dft_variance"): self.dft_variance = np.var(dft, axis=0) symbols = np.log2(self.alphabet_size) From bdffc39462f8bc8bdccbc55ef58782363677a9da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Patrick=20Sch=C3=A4fer?= Date: Mon, 26 May 2025 21:19:14 +0200 Subject: [PATCH 04/13] bugfix --- .../collection/dictionary_based/_sfa_fast.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/aeon/transformations/collection/dictionary_based/_sfa_fast.py b/aeon/transformations/collection/dictionary_based/_sfa_fast.py index d75f575e7c..223a5324ef 100644 --- a/aeon/transformations/collection/dictionary_based/_sfa_fast.py +++ b/aeon/transformations/collection/dictionary_based/_sfa_fast.py @@ -598,7 +598,7 @@ def _binning(self, X, y=None): self.dft_length = self.dft_length + self.dft_length % 2 # even # learn alphabet sizes - if self.learn_alphabet_sizes: + if self.learn_alphabet_sizes and self.alphabet_allocation_method: if not hasattr(self, "dft_variance"): self.dft_variance = np.var(dft, axis=0) @@ -628,7 +628,9 @@ def _binning(self, X, y=None): self.letter_bits = np.array(bit_arr, dtype=np.uint32) else: # use the same alphabet size for all positions - self.alphabet_sizes = [self.alphabet_size for _ in range(self.word_length)] + self.alphabet_sizes = [ + self.alphabet_size for _ in range(self.word_length_actual) + ] self.alphabet_sizes = np.array(self.alphabet_sizes) @@ -830,7 +832,7 @@ def _get_chars(word, word_length, letter_bits): return chars -@njit(fastmath=True, cache=True) +@njit(fastmath=True, cache=True, parallel=True) def _binning_dft( X, window_size, @@ -1013,7 +1015,7 @@ def _get_phis(window_size, length): return phis -@njit(fastmath=True, cache=True) +@njit(fastmath=True, cache=True, parallel=True) def generate_words( dfts, bigrams, skip_grams, window_size, breakpoints, word_length, letter_bits ): @@ -1043,7 +1045,7 @@ def generate_words( for a in prange(dfts.shape[0]): for i in range(word_length): # range(dfts.shape[2]): words[a, : dfts.shape[1]] = ( - words[a, : dfts.shape[1]] << letter_bits[a] + words[a, : dfts.shape[1]] << letter_bits[i] ) | np.digitize(dfts[a, :, i], breakpoints[i], right=True) # add bigrams @@ -1203,7 +1205,7 @@ def create_feature_names(sfa_words): return feature_names -@njit(cache=True, fastmath=True) +@njit(cache=True, fastmath=True, parallel=True) def create_bag_none( X_index, breakpoints, n_cases, sfa_words, word_length, remove_repeat_words ): @@ -1251,7 +1253,7 @@ def 
create_bag_feature_selection( return all_win_words, relevant_features -@njit(cache=True, fastmath=True) +@njit(cache=True, fastmath=True, parallel=True) def create_bag_transform( X_index, feature_count, @@ -1327,7 +1329,7 @@ def mcb(dft, alphabet_sizes, word_length_actual, binning_method): return breakpoints -@njit(fastmath=True, cache=True) +@njit(fastmath=True, cache=True, parallel=True) def shorten_words(words, amount, letter_bits): new_words = np.zeros((words.shape[0], words.shape[1]), dtype=np.uint32) From 0ee4ec96ad1b4e831b3af775b6696edcb5694aa3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Patrick=20Sch=C3=A4fer?= Date: Mon, 26 May 2025 22:02:23 +0200 Subject: [PATCH 05/13] bugfix --- aeon/transformations/collection/dictionary_based/_sfa_fast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aeon/transformations/collection/dictionary_based/_sfa_fast.py b/aeon/transformations/collection/dictionary_based/_sfa_fast.py index 223a5324ef..89d6216a1e 100644 --- a/aeon/transformations/collection/dictionary_based/_sfa_fast.py +++ b/aeon/transformations/collection/dictionary_based/_sfa_fast.py @@ -1205,7 +1205,7 @@ def create_feature_names(sfa_words): return feature_names -@njit(cache=True, fastmath=True, parallel=True) +@njit(cache=True, fastmath=True) # TODO does not work parallel=True ?? def create_bag_none( X_index, breakpoints, n_cases, sfa_words, word_length, remove_repeat_words ): From 884b4b71137c1cc440b6fd22da2afafc0ec02d61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Patrick=20Sch=C3=A4fer?= Date: Fri, 30 May 2025 10:37:46 +0200 Subject: [PATCH 06/13] extend test case and fix sfafast --- aeon/distances/tests/test_symbolic_mindist.py | 1 + .../collection/dictionary_based/_sfa_fast.py | 32 +++++++++++-------- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/aeon/distances/tests/test_symbolic_mindist.py b/aeon/distances/tests/test_symbolic_mindist.py index 35ab500fde..ebc615cf10 100644 --- a/aeon/distances/tests/test_symbolic_mindist.py +++ b/aeon/distances/tests/test_symbolic_mindist.py @@ -220,6 +220,7 @@ def test_dynamic_alphabet_allocation(): # Euclidean Distance ed = np.linalg.norm(X[0] - Y[0]) + assert np.mean(np.log2(sfa.alphabet_sizes)) == np.log2(alphabet_size) assert mindist_sfa <= ed assert mindist_dft_sfa >= mindist_sfa # a tighter lower bound assert mindist_dft_sfa <= ed diff --git a/aeon/transformations/collection/dictionary_based/_sfa_fast.py b/aeon/transformations/collection/dictionary_based/_sfa_fast.py index 89d6216a1e..da09160bc0 100644 --- a/aeon/transformations/collection/dictionary_based/_sfa_fast.py +++ b/aeon/transformations/collection/dictionary_based/_sfa_fast.py @@ -607,22 +607,20 @@ def _binning(self, X, y=None): if self.alphabet_allocation_method == "linear_scale": variance = np.sqrt(self.dft_variance[self.support]) - normed_scale = variance / variance.sum() + normed_scale = variance / variance.mean() elif self.alphabet_allocation_method == "sqrt_scale": variance = np.sqrt(np.sqrt(self.dft_variance[self.support])) - normed_scale = variance / variance.sum() - + normed_scale = variance / variance.mean() elif self.alphabet_allocation_method == "log_scale": variance = np.log2(np.sqrt(self.dft_variance[self.support]) + 1) - normed_scale = variance / variance.sum() + normed_scale = variance / variance.mean() - bit_arr_raw = np.floor(normed_scale * symbols * self.word_length).astype( + # Use at most symbols+1 for each position, and check if sum is correct + bit_arr = np.ceil(normed_scale * np.log2(self.alphabet_size)).astype( np.uint32 ) - 
- # Use at most symbols+1 for each position - bit_arr = heal_array(bit_arr_raw, symbols + 1, self.bit_budget) + bit_arr = heal_array(bit_arr, symbols + 1, self.bit_budget) self.alphabet_sizes = [int(2 ** bit_arr[i]) for i in range(len(bit_arr))] self.letter_bits = np.array(bit_arr, dtype=np.uint32) @@ -1394,16 +1392,24 @@ def _transform_words_case( def heal_array(bit_array, max_val, budget): bit_array = bit_array.copy() - # cap values beyond max_val + # cap values beyond max_val and below 1 for i in range(len(bit_array)): bit_array[i] = min(max_val, bit_array[i]) + bit_array[i] = max(1, bit_array[i]) + # to large: heal the array to have the correct sum == budget if bit_array.sum() > budget: - # print("Error", bit_array, bit_array.sum(), budget, bit_array.sum() <= budget) - assert bit_array.sum() <= budget + diff = bit_array.sum() - budget + while diff > 0: + for i in range(len(bit_array) - 1, 0, -1): + if bit_array[i] > 1: + bit_array[i] -= 1 + diff -= 1 + if diff == 0: + break - # heal the array to have the correct sum == budget - if bit_array.sum() != budget: + # to small: heal the array to have the correct sum == budget + if bit_array.sum() < budget: diff = budget - bit_array.sum() while diff > 0: for i in range(len(bit_array)): From ebc632114e864a2b931a5460a41a270143df7e6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Patrick=20Sch=C3=A4fer?= Date: Tue, 3 Jun 2025 16:52:31 +0200 Subject: [PATCH 07/13] change logic --- .../collection/dictionary_based/_sfa_fast.py | 45 ++++++++++--------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/aeon/transformations/collection/dictionary_based/_sfa_fast.py b/aeon/transformations/collection/dictionary_based/_sfa_fast.py index da09160bc0..ff37a5162b 100644 --- a/aeon/transformations/collection/dictionary_based/_sfa_fast.py +++ b/aeon/transformations/collection/dictionary_based/_sfa_fast.py @@ -616,11 +616,9 @@ def _binning(self, X, y=None): variance = np.log2(np.sqrt(self.dft_variance[self.support]) + 1) normed_scale = variance / variance.mean() - # Use at most symbols+1 for each position, and check if sum is correct - bit_arr = np.ceil(normed_scale * np.log2(self.alphabet_size)).astype( - np.uint32 - ) - bit_arr = heal_array(bit_arr, symbols + 1, self.bit_budget) + # Use at most symbols+2 for each position, and check if sum is correct + bit_arr = np.ceil(normed_scale * symbols).astype(np.uint32) + bit_arr = heal_array(bit_arr, symbols + 2, self.bit_budget) self.alphabet_sizes = [int(2 ** bit_arr[i]) for i in range(len(bit_arr))] self.letter_bits = np.array(bit_arr, dtype=np.uint32) @@ -1393,30 +1391,37 @@ def heal_array(bit_array, max_val, budget): bit_array = bit_array.copy() # cap values beyond max_val and below 1 - for i in range(len(bit_array)): - bit_array[i] = min(max_val, bit_array[i]) - bit_array[i] = max(1, bit_array[i]) - - # to large: heal the array to have the correct sum == budget - if bit_array.sum() > budget: - diff = bit_array.sum() - budget - while diff > 0: - for i in range(len(bit_array) - 1, 0, -1): - if bit_array[i] > 1: + bit_array = np.minimum(max_val, np.maximum(1, bit_array)) + current_sum = bit_array.sum() + + # too large: heal the array to have the correct sum == budget + if current_sum > budget: + diff = current_sum - budget + change = True + while (diff > 0) and change: + change = False + for i in range(len(bit_array) - 1, -1, -1): + if bit_array[i] >= 1: bit_array[i] -= 1 diff -= 1 + change = True if diff == 0: break - - # to small: heal the array to have the correct sum == budget - if bit_array.sum() < 
budget: - diff = budget - bit_array.sum() - while diff > 0: + assert diff == 0, "Cannot reduce sum to budget" + + # too small: heal the array to have the correct sum == budget + if current_sum < budget: + diff = budget - current_sum + change = True + while (diff > 0) and change: + change = False for i in range(len(bit_array)): if bit_array[i] < max_val: bit_array[i] += 1 diff -= 1 + change = True if diff == 0: break + assert diff == 0, "Cannot increase sum to budget" return bit_array From 598a0de4f62d2232114106db6f649a5863e9abdf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Patrick=20Sch=C3=A4fer?= Date: Wed, 4 Jun 2025 12:41:30 +0200 Subject: [PATCH 08/13] refactor code --- .../collection/dictionary_based/_sfa_fast.py | 77 +++++++++---------- 1 file changed, 37 insertions(+), 40 deletions(-) diff --git a/aeon/transformations/collection/dictionary_based/_sfa_fast.py b/aeon/transformations/collection/dictionary_based/_sfa_fast.py index ff37a5162b..0e641254db 100644 --- a/aeon/transformations/collection/dictionary_based/_sfa_fast.py +++ b/aeon/transformations/collection/dictionary_based/_sfa_fast.py @@ -606,19 +606,16 @@ def _binning(self, X, y=None): self.bit_budget = int(symbols * self.word_length) if self.alphabet_allocation_method == "linear_scale": - variance = np.sqrt(self.dft_variance[self.support]) + variance = self.dft_variance[self.support] normed_scale = variance / variance.mean() - elif self.alphabet_allocation_method == "sqrt_scale": - variance = np.sqrt(np.sqrt(self.dft_variance[self.support])) + variance = np.sqrt(self.dft_variance[self.support]) normed_scale = variance / variance.mean() elif self.alphabet_allocation_method == "log_scale": - variance = np.log2(np.sqrt(self.dft_variance[self.support]) + 1) + variance = np.log2((self.dft_variance[self.support]) + 1) normed_scale = variance / variance.mean() - # Use at most symbols+2 for each position, and check if sum is correct - bit_arr = np.ceil(normed_scale * symbols).astype(np.uint32) - bit_arr = heal_array(bit_arr, symbols + 2, self.bit_budget) + bit_arr = assign_bits_dynamically(normed_scale, self.bit_budget) self.alphabet_sizes = [int(2 ** bit_arr[i]) for i in range(len(bit_arr))] self.letter_bits = np.array(bit_arr, dtype=np.uint32) @@ -1387,41 +1384,41 @@ def _transform_words_case( @njit(fastmath=True, cache=True) -def heal_array(bit_array, max_val, budget): - bit_array = bit_array.copy() +def assign_bits_dynamically(variance, budget, max_bit_val=9): + """Assign bits dynamically based on variance and budget. + + The goal is to maximize the variance covered by each symbol. + + Parameters + ---------- + variance : 1d numpy array + the variance for each position. + budget : float + the maximal number of bits to assign. + max_bit_val : int, optional, default=9 + the maximum number of bits that can be assigned to a single position. - # cap values beyond max_val and below 1 - bit_array = np.minimum(max_val, np.maximum(1, bit_array)) + Returns + ------- + bit_array : 1d numpy array + the number of bits assigned to each position. 
+ """ + bit_array = np.zeros(len(variance), dtype=np.int32) + bit_array[:] = 0 + + improve = variance.copy() / 2.0 + + # assign bits to positions current_sum = bit_array.sum() + while current_sum < budget: + best_pos = np.argmax(improve) + bit_array[best_pos] += 1 + current_sum += 1 + + # recalculate the improvement + improve[best_pos] = variance[best_pos] / (2 ** (bit_array[best_pos] + 1)) - # too large: heal the array to have the correct sum == budget - if current_sum > budget: - diff = current_sum - budget - change = True - while (diff > 0) and change: - change = False - for i in range(len(bit_array) - 1, -1, -1): - if bit_array[i] >= 1: - bit_array[i] -= 1 - diff -= 1 - change = True - if diff == 0: - break - assert diff == 0, "Cannot reduce sum to budget" - - # too small: heal the array to have the correct sum == budget - if current_sum < budget: - diff = budget - current_sum - change = True - while (diff > 0) and change: - change = False - for i in range(len(bit_array)): - if bit_array[i] < max_val: - bit_array[i] += 1 - diff -= 1 - change = True - if diff == 0: - break - assert diff == 0, "Cannot increase sum to budget" + if bit_array[best_pos] == max_bit_val: + improve[best_pos] = 0 return bit_array From aaa741239f928901fd7cc015ea848bc2d39bab01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Patrick=20Sch=C3=A4fer?= Date: Wed, 4 Jun 2025 12:42:16 +0200 Subject: [PATCH 09/13] refactor code --- aeon/distances/tests/test_symbolic_mindist.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aeon/distances/tests/test_symbolic_mindist.py b/aeon/distances/tests/test_symbolic_mindist.py index ebc615cf10..ac34d5ec80 100644 --- a/aeon/distances/tests/test_symbolic_mindist.py +++ b/aeon/distances/tests/test_symbolic_mindist.py @@ -175,7 +175,7 @@ def test_sfa_whole_mindist(): def test_dynamic_alphabet_allocation(): """Test the SFA Min-Distance function.""" n_segments = 16 - alphabet_size = 256 + alphabet_size = 64 X_train, _ = load_unit_test("TRAIN") X_test, _ = load_unit_test("TEST") From 4204182ee66632497090b83298867dd94873b905 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Patrick=20Sch=C3=A4fer?= Date: Wed, 4 Jun 2025 12:47:36 +0200 Subject: [PATCH 10/13] change default --- .../transformations/collection/dictionary_based/_sfa_whole.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aeon/transformations/collection/dictionary_based/_sfa_whole.py b/aeon/transformations/collection/dictionary_based/_sfa_whole.py index 51259cefef..083701dddb 100644 --- a/aeon/transformations/collection/dictionary_based/_sfa_whole.py +++ b/aeon/transformations/collection/dictionary_based/_sfa_whole.py @@ -32,7 +32,7 @@ class SFAWhole(SFAFast): learn_alphabet_sizes : boolean, default = True If True, dynamic alphabet sizes are learned based on the variance of the Fourier coefficients. - alphabet_allocation_method : str, default = sqrt_scale + alphabet_allocation_method : str, default = linear_scale The method used to learn the dynamic alphabet sizes. One of {"linear_scale", "log_scale", "sqrt_scale"}. 
norm : boolean, default = False @@ -78,7 +78,7 @@ def __init__( word_length=8, alphabet_size=4, learn_alphabet_sizes=True, - alphabet_allocation_method="sqrt_scale", + alphabet_allocation_method="linear_scale", norm=True, binning_method="equi-width", variance=True, From a02b5f645f8dc4833fa2492496b2f5226dbac61e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Patrick=20Sch=C3=A4fer?= Date: Wed, 4 Jun 2025 12:56:07 +0200 Subject: [PATCH 11/13] tidy up code --- aeon/transformations/collection/dictionary_based/_sfa_fast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aeon/transformations/collection/dictionary_based/_sfa_fast.py b/aeon/transformations/collection/dictionary_based/_sfa_fast.py index 0e641254db..8b14f9ccdc 100644 --- a/aeon/transformations/collection/dictionary_based/_sfa_fast.py +++ b/aeon/transformations/collection/dictionary_based/_sfa_fast.py @@ -1198,7 +1198,7 @@ def create_feature_names(sfa_words): return feature_names -@njit(cache=True, fastmath=True) # TODO does not work parallel=True ?? +@njit(cache=True, fastmath=True) # does not work with parallel=True ?? def create_bag_none( X_index, breakpoints, n_cases, sfa_words, word_length, remove_repeat_words ): From 3e9c1ea6de31c6bbd79afa9f83db7a02fbdb3d88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Patrick=20Sch=C3=A4fer?= Date: Fri, 6 Jun 2025 12:19:55 +0200 Subject: [PATCH 12/13] updates --- .../collection/dictionary_based/_sfa_fast.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/aeon/transformations/collection/dictionary_based/_sfa_fast.py b/aeon/transformations/collection/dictionary_based/_sfa_fast.py index 8b14f9ccdc..d15efbfdb8 100644 --- a/aeon/transformations/collection/dictionary_based/_sfa_fast.py +++ b/aeon/transformations/collection/dictionary_based/_sfa_fast.py @@ -615,10 +615,10 @@ def _binning(self, X, y=None): variance = np.log2((self.dft_variance[self.support]) + 1) normed_scale = variance / variance.mean() - bit_arr = assign_bits_dynamically(normed_scale, self.bit_budget) - - self.alphabet_sizes = [int(2 ** bit_arr[i]) for i in range(len(bit_arr))] - self.letter_bits = np.array(bit_arr, dtype=np.uint32) + self.letter_bits = assign_bits_dynamically(normed_scale, self.bit_budget) + self.alphabet_sizes = [ + int(2 ** self.letter_bits[i]) for i in range(len(self.letter_bits)) + ] else: # use the same alphabet size for all positions self.alphabet_sizes = [ @@ -1403,7 +1403,7 @@ def assign_bits_dynamically(variance, budget, max_bit_val=9): bit_array : 1d numpy array the number of bits assigned to each position. 
""" - bit_array = np.zeros(len(variance), dtype=np.int32) + bit_array = np.zeros(len(variance), dtype=np.uint32) bit_array[:] = 0 improve = variance.copy() / 2.0 From 0e8cf701389f55af867543ec622b0e8e8ef27563 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Patrick=20Sch=C3=A4fer?= Date: Mon, 30 Jun 2025 15:00:59 +0200 Subject: [PATCH 13/13] refactor --- aeon/distances/tests/test_symbolic_mindist.py | 1 - .../collection/dictionary_based/_sfa_fast.py | 14 +++++--------- .../collection/dictionary_based/_sfa_whole.py | 5 ----- 3 files changed, 5 insertions(+), 15 deletions(-) diff --git a/aeon/distances/tests/test_symbolic_mindist.py b/aeon/distances/tests/test_symbolic_mindist.py index ac34d5ec80..c545b07140 100644 --- a/aeon/distances/tests/test_symbolic_mindist.py +++ b/aeon/distances/tests/test_symbolic_mindist.py @@ -194,7 +194,6 @@ def test_dynamic_alphabet_allocation(): word_length=n_segments, alphabet_size=alphabet_size, binning_method=histogram_type, - learn_alphabet_sizes=True, alphabet_allocation_method=alphabet_allocation_method, variance=True, # True gives a tighter lower bound norm=True, diff --git a/aeon/transformations/collection/dictionary_based/_sfa_fast.py b/aeon/transformations/collection/dictionary_based/_sfa_fast.py index de921304fe..1f8afe8db4 100644 --- a/aeon/transformations/collection/dictionary_based/_sfa_fast.py +++ b/aeon/transformations/collection/dictionary_based/_sfa_fast.py @@ -71,9 +71,6 @@ class SFAFast(BaseCollectionTransformer): Number of values to discretise each value to. window_size : int, default = 12 Size of window for sliding. Input series length for whole series transform. - learn_alphabet_sizes : boolean, default = False - If True, dynamic alphabet sizes are learned based on the variance of the Fourier - coefficients. alphabet_allocation_method : str, default = None The method used to learn the dynamic alphabet sizes. One of {"linear_scale", "log_scale", "sqrt_scale"}. @@ -84,12 +81,12 @@ class SFAFast(BaseCollectionTransformer): "equi-width", "information-gain", "information-gain-mae", "kmeans"}, anova : boolean, default = False If True, the Fourier coefficient selection is done via a one-way ANOVA test. - If False, the first Fourier coefficients are selected. Only applicable if - labels are given. + If False, the first Fourier coefficients are selected. + It is a supervised feature selection strategy. Only applicable if labels given. variance : boolean, default = False If True, the Fourier coefficient selection is done via the largest variance. - If False, the first Fourier coefficients are selected. Only applicable if - labels are given. + If False, the first Fourier coefficients are selected. + It is an unsupervised feature selection strategy. dilation : int, default = 0 When set to dilation > 1, adds dilation to the sliding window operation. 
save_words : boolean, default = False @@ -155,7 +152,6 @@ def __init__( word_length=8, alphabet_size=4, window_size=12, - learn_alphabet_sizes=False, alphabet_allocation_method=None, norm=False, binning_method="equi-depth", @@ -212,8 +208,8 @@ def __init__( self.dilation = dilation self.first_difference = first_difference self.sampling_factor = sampling_factor - self.learn_alphabet_sizes = learn_alphabet_sizes self.alphabet_allocation_method = alphabet_allocation_method + self.learn_alphabet_sizes = self.alphabet_allocation_method is not None # Feature selection part self.feature_selection = feature_selection diff --git a/aeon/transformations/collection/dictionary_based/_sfa_whole.py b/aeon/transformations/collection/dictionary_based/_sfa_whole.py index 083701dddb..0f7e4aad2a 100644 --- a/aeon/transformations/collection/dictionary_based/_sfa_whole.py +++ b/aeon/transformations/collection/dictionary_based/_sfa_whole.py @@ -29,9 +29,6 @@ class SFAWhole(SFAFast): Length of word to shorten window to (using DFT). alphabet_size : int, default = 4 Number of values to discretise each value to. - learn_alphabet_sizes : boolean, default = True - If True, dynamic alphabet sizes are learned based on the variance of the Fourier - coefficients. alphabet_allocation_method : str, default = linear_scale The method used to learn the dynamic alphabet sizes. One of {"linear_scale", "log_scale", "sqrt_scale"}. @@ -77,7 +74,6 @@ def __init__( self, word_length=8, alphabet_size=4, - learn_alphabet_sizes=True, alphabet_allocation_method="linear_scale", norm=True, binning_method="equi-width", @@ -90,7 +86,6 @@ def __init__( word_length=word_length, alphabet_size=alphabet_size, norm=norm, - learn_alphabet_sizes=learn_alphabet_sizes, alphabet_allocation_method=alphabet_allocation_method, binning_method=binning_method, variance=variance,