From 8d21d5d361ed83cb6dd6ad76bbe570271cce0011 Mon Sep 17 00:00:00 2001 From: rflperry Date: Mon, 8 Mar 2021 22:33:27 -0500 Subject: [PATCH 01/10] working prototype --- proglearn/forest.py | 168 ++++++++++++++++++++++++++++--- proglearn/progressive_learner.py | 29 ++++-- proglearn/tests/test_forest.py | 63 +++++++++++- proglearn/transformers.py | 72 +++++++++++-- 4 files changed, 303 insertions(+), 29 deletions(-) diff --git a/proglearn/forest.py b/proglearn/forest.py index 80bac5ea3b..32f50dbd2f 100644 --- a/proglearn/forest.py +++ b/proglearn/forest.py @@ -35,6 +35,21 @@ class LifelongClassificationForest(ClassificationProgressiveLearner): The maximum depth of a tree in the Lifelong Classification Forest. This is used if 'max_depth' is not fed to add_task. + n_jobs : int, default=1 + The number of jobs to run in parallel. ``-1`` means use all + processors. + + max_samples : int or float, default=0.5 + The number of samples to draw from X (without replacement) to train + each tree. + - If None, then draw `X.shape[0]` samples. + - If int, then draw `max_samples` samples. + - If float, then draw `max_samples * X.shape[0]` samples. Thus, + `max_samples` should be in the interval `(0, 1)`. + + Note: The number of samples used to learn the tree will be further + reduced per the `tree_construction_proportion` value. 
+ Attributes ---------- pl_ : ClassificationProgressiveLearner @@ -48,11 +63,15 @@ def __init__( default_tree_construction_proportion=0.67, default_kappa=np.inf, default_max_depth=30, + max_samples=None, + n_jobs=None, ): self.default_n_estimators = default_n_estimators self.default_tree_construction_proportion = default_tree_construction_proportion self.default_kappa = default_kappa self.default_max_depth = default_max_depth + self.max_samples = max_samples + self.n_jobs = n_jobs self.pl_ = ClassificationProgressiveLearner( default_transformer_class=TreeClassificationTransformer, @@ -61,6 +80,7 @@ def __init__( default_voter_kwargs={"kappa": default_kappa}, default_decider_class=SimpleArgmaxAverage, default_decider_kwargs={}, + n_jobs=n_jobs, ) def add_task( @@ -72,6 +92,8 @@ def add_task( tree_construction_proportion="default", kappa="default", max_depth="default", + transformer_kwargs={}, + max_samples=1.0, ): """ adds a task with id task_id, max tree depth max_depth, given input data matrix X @@ -103,15 +125,35 @@ def add_task( The coefficient for finite sample correction. The default is used if 'default' is provided. + TODO prune max_depth into transformer_kwargs max_depth : int or str, default='default' The maximum depth of a tree in the Lifelong Classification Forest. The default is used if 'default' is provided. + transformer_kwargs : dict, default={} + Additional named arguments to be passed to the transformer. + + n_jobs : int, default=1 + The number of jobs to run in parallel. ``-1`` means use all + processors. + + max_samples : int or float, default=0.5 + The number of samples to draw from X (without replacement) to train + each tree. + - If None, then draw `X.shape[0]` samples. + - If int, then draw `max_samples` samples. + - If float, then draw `max_samples * X.shape[0]` samples. Thus, + `max_samples` should be in the interval `(0, 1)`. 
+ + Note: The number of samples used to learn the tree will be further + reduced per the `tree_construction_proportion` value. + Returns ------- self : LifelongClassificationForest The object itself. """ + # TODO get rid of defaults in favor of None if n_estimators == "default": n_estimators = self.default_n_estimators if tree_construction_proportion == "default": @@ -121,18 +163,28 @@ def add_task( if max_depth == "default": max_depth = self.default_max_depth + # TODO eliminate by subsuming max_depth + if not "fit_kwargs" in transformer_kwargs.keys(): + transformer_kwargs["fit_kwargs"] = {} + transformer_kwargs["fit_kwargs"]["max_depth"] = max_depth + X, y = check_X_y(X, y) + if isinstance(max_samples, int): + assert max_samples > 1 + max_samples = min(1, max_samples / X.shape[0]) + elif max_samples is None: + max_samples = 1.0 return self.pl_.add_task( X, y, task_id=task_id, transformer_voter_decider_split=[ - tree_construction_proportion, - 1 - tree_construction_proportion, + tree_construction_proportion * max_samples, + (1 - tree_construction_proportion) * max_samples, 0, ], num_transformers=n_estimators, - transformer_kwargs={"kwargs": {"max_depth": max_depth}}, + transformer_kwargs=transformer_kwargs, voter_kwargs={ "classes": np.unique(y), "kappa": kappa, @@ -232,7 +284,7 @@ def predict(self, X, task_id): class UncertaintyForest: """ - A class used to represent an uncertainty forest. + A class used to represent an Uncertainty Forest. Parameters ---------- @@ -243,31 +295,95 @@ class UncertaintyForest: The coefficient for finite sample correction. If set to the default value, finite sample correction is not performed. - max_depth : int, default=30 - The maximum depth of a tree in the UncertaintyForest + max_depth : int, default=None + The maximum depth of a tree in the UncertaintyForest. 
- tree_construction_proportion : float, default = 0.67 + tree_construction_proportion : float, default=0.5 The proportions of the input data set aside to train each decision tree. The remainder of the data is used to fill in voting posteriors. + max_features : {"auto", "sqrt", "log2"}, int or float, default="auto" + The number of features to consider when looking for the best split: + - If int, then consider `max_features` features at each split. + - If float, then `max_features` is a fraction and + `round(max_features * n_features)` features are considered at each + split. + - If "auto", then `max_features=sqrt(n_features)`. + - If "sqrt", then `max_features=sqrt(n_features)` (same as "auto"). + - If "log2", then `max_features=log2(n_features)`. + - If None, then `max_features=n_features`. + + Note: the search for a split does not stop until at least one + valid partition of the node samples is found, even if it requires to + effectively inspect more than ``max_features`` features. + + poisson_sampler : boolean, default=True + To match the GRF theory [#1grf]_, if True, the number of features + considered at each tree are drawn from a poisson distribution with + mean equal to `max_features`. + + n_jobs : int, default=1 + The number of jobs to run in parallel. ``-1`` means use all + processors. + + max_samples : int or float, default=0.5 + The number of samples to draw from X (without replacement) to train + each tree. + - If None, then draw `X.shape[0]` samples. + - If int, then draw `max_samples` samples. + - If float, then draw `max_samples * X.shape[0]` samples. Thus, + `max_samples` should be in the interval `(0, 1)`. + + Note: The number of samples used to learn the tree will be further + reduced per the `tree_construction_proportion` value. + + tree_kwargs : dict, default={} + Named arguments to be passed to each + sklearn.tree.DecisionTreeClassifier tree used in the construction + of the forest in addition to the above parameters. 
+ Attributes ---------- + estimators_ : list of sklearn.tree.DecisionTreeClassifier + The collection of fitted trees. + lf_ : LifelongClassificationForest Internal LifelongClassificationForest used to train and make inference. + + n_features_ : int + The number of features when `fit` is performed. + + tree_kwargs_ : dict + Full set of keyword arguments passed to the Forest transformer. + + References + ---------- + .. [#1grf] Athey, Susan, Julie Tibshirani and Stefan Wager. + "Generalized Random Forests", Annals of Statistics, 2019. """ def __init__( self, n_estimators=100, kappa=np.inf, - max_depth=30, - tree_construction_proportion=0.67, + max_depth=None, + tree_construction_proportion=0.5, + max_features="auto", + poisson_sampler=True, + max_samples=0.5, + n_jobs=None, + tree_kwargs={}, ): self.n_estimators = n_estimators self.kappa = kappa self.max_depth = max_depth self.tree_construction_proportion = tree_construction_proportion + self.max_features = max_features + self.poisson_sampler = poisson_sampler + self.max_samples = max_samples + self.n_jobs = n_jobs + self.tree_kwargs = tree_kwargs def fit(self, X, y): """ @@ -286,6 +402,8 @@ def fit(self, X, y): self : UncertaintyForest The object itself. """ + X, y = check_X_y(X, y) + self.n_features_ = X.shape[1] self.lf_ = LifelongClassificationForest( default_n_estimators=self.n_estimators, default_kappa=self.kappa, @@ -293,8 +411,26 @@ def fit(self, X, y): default_tree_construction_proportion=self.tree_construction_proportion, ) - X, y = check_X_y(X, y) - return self.lf_.add_task(X, y, task_id=0) + self.tree_kwargs_ = { + "fit_kwargs": self.tree_kwargs, + "max_features": self.max_features, + "poisson_sampler": self.poisson_sampler, + } + self.lf_.add_task( + X, + y, + task_id=0, + transformer_kwargs=self.tree_kwargs_, + max_samples=self.max_samples, + ) + + return self + + @property + def estimators_(self): + if not hasattr(self, "lf_"): + raise AttributeError("Model has not been fitted. 
Please fit first.") + return [t.transformer_ for t in self.lf_.pl_.transformer_id_to_transformers[0]] def predict_proba(self, X): """ @@ -310,7 +446,10 @@ def predict_proba(self, X): y_proba_hat : ndarray of shape [n_samples, n_classes] posteriors per example """ - return self.lf_.predict_proba(check_array(X), 0) + X = check_array(X) + if not hasattr(self, "lf_"): + raise AttributeError("Model has not been fitted. Please fit first.") + return self.lf_.predict_proba(X, 0) def predict(self, X): """ @@ -326,4 +465,7 @@ def predict(self, X): y_hat : ndarray of shape [n_samples] predicted class label per example """ - return self.lf_.predict(check_array(X), 0) + X = check_array(X) + if not hasattr(self, "lf_"): + raise AttributeError("Model has not been fitted. Please fit first.") + return self.lf_.predict(X, 0) diff --git a/proglearn/progressive_learner.py b/proglearn/progressive_learner.py index 4bfecc7f9c..44ddd3dd15 100755 --- a/proglearn/progressive_learner.py +++ b/proglearn/progressive_learner.py @@ -3,6 +3,7 @@ Corresponding Email: levinewill@icloud.com """ import numpy as np +from joblib import Parallel, delayed from .base import BaseClassificationProgressiveLearner, BaseProgressiveLearner @@ -137,6 +138,10 @@ class ProgressiveLearner(BaseProgressiveLearner): default_decider_kwargs : dict Stores the default decider kwargs as specified by the parameter default_decider_kwargs. + + n_jobs : int, default=1 + The number of jobs to run in parallel when adding multiple + transformers per task. ``-1`` means use all processors. 
""" def __init__( @@ -147,6 +152,7 @@ def __init__( default_voter_kwargs=None, default_decider_class=None, default_decider_kwargs=None, + n_jobs=None, ): ( @@ -178,6 +184,8 @@ def __init__( self.default_decider_class = default_decider_class self.default_decider_kwargs = default_decider_kwargs + self.n_jobs = n_jobs + def get_transformer_ids(self): return np.array(list(self.transformer_id_to_transformers.keys())) @@ -498,14 +506,15 @@ def add_transformer( if transformer_id not in list(self.task_id_to_y.keys()): self.transformer_id_to_y[transformer_id] = y - # train new transformers - for transformer_num in range(num_transformers): - if X is not None: - n = len(X) - elif y is not None: - n = len(y) - else: - n = None + if X is not None: + n = len(X) + elif y is not None: + n = len(y) + else: + n = None + + def _train_new_transformer(transformer_num): + # train new transformers if n is not None: transformer_data_idx = np.random.choice( transformer_voter_data_idx, @@ -527,6 +536,10 @@ def add_transformer( voter_data_idx=voter_data_idx, ) + _ = Parallel(n_jobs=self.n_jobs)( + delayed(_train_new_transformer)(num) for num in range(num_transformers) + ) + # train voters and deciders from new transformer to previous tasks for existing_task_id in np.intersect1d(backward_task_ids, self.get_task_ids()): self.set_voter(transformer_id=transformer_id, task_id=existing_task_id) diff --git a/proglearn/tests/test_forest.py b/proglearn/tests/test_forest.py index acf7e9976b..17ba218673 100644 --- a/proglearn/tests/test_forest.py +++ b/proglearn/tests/test_forest.py @@ -3,7 +3,7 @@ import numpy as np import random -from proglearn.forest import LifelongClassificationForest +from proglearn.forest import LifelongClassificationForest, UncertaintyForest from proglearn.transformers import TreeClassificationTransformer from proglearn.voters import TreeClassificationVoter from proglearn.deciders import SimpleArgmaxAverage @@ -47,3 +47,64 @@ def test_correct_default_n_estimators(self): def 
test_correct_true_initilization_finite_sample_correction(self): l2f = LifelongClassificationForest(default_kappa=np.inf) assert l2f.pl_.default_voter_kwargs == {"kappa": np.inf} + + +# Test Uncertainty Forest + + +def test_uf_accuracy(): + uf = UncertaintyForest() + X = np.ones((20, 4)) + X[10:] *= -1 + y = [0] * 10 + [1] * 10 + uf = uf.fit(X, y) + np.testing.assert_array_equal(uf.predict(X), y) + + +@pytest.mark.parametrize("max_depth", [1, None]) +@pytest.mark.parametrize("max_features", [2, 0.5, "auto", "sqrt", "log2"]) +@pytest.mark.parametrize("poisson_sampler", [False, True]) +def test_decision_tree_params(max_depth, max_features, poisson_sampler): + uf = UncertaintyForest( + max_depth=max_depth, max_features=max_features, poisson_sampler=poisson_sampler + ) + X = np.ones((12, 20)) + X[6:] *= -1 + y = [0] * 6 + [1] * 6 + uf = uf.fit(X, y) + + assert uf.n_estimators == len(uf.estimators_) + depths = [est.max_depth for est in uf.estimators_] + assert all(np.asarray(depths) == max_depth) + + features = [est.max_features for est in uf.estimators_] + if poisson_sampler: + assert not all(np.asarray(features) == features[0]) + else: + assert all(np.asarray(features) == max_features) + + +def test_parallel_trees(): + uf = UncertaintyForest(n_jobs=2) + X = np.random.normal(0, 1, (100, 2)) + X[:50] *= -1 + y = [0] * 50 + [1] * 50 + uf = uf.fit(X, y) + + +def test_max_samples(): + max_samples_list = [8, 0.5, None] + depths = [] + X = np.random.normal(0, 1, (100, 2)) + X[:50] *= -1 + y = [0, 1] * 50 + for ms in max_samples_list: + uf = UncertaintyForest(n_estimators=1, max_samples=ms) + uf = uf.fit(X, y) + depths.append(uf.estimators_[0].get_depth()) + + assert all(np.diff(depths) > 0) + +# @pytest.mark.parametrize("signal_ranks", [None, 2]) +def test_uf_params(): + pass diff --git a/proglearn/transformers.py b/proglearn/transformers.py index 6af4d5fe8b..e647c1bfd7 100644 --- a/proglearn/transformers.py +++ b/proglearn/transformers.py @@ -2,6 +2,7 @@ Main Author: Will 
LeVine Corresponding Email: levinewill@icloud.com """ +import warnings import keras import numpy as np from sklearn.tree import DecisionTreeClassifier @@ -133,17 +134,48 @@ class TreeClassificationTransformer(BaseTransformer): Parameters ---------- - kwargs : dict, default={} - A dictionary to contain parameters of the tree. + max_features : {"auto", "sqrt", "log2"}, int or float, default=None + The number of features to consider when looking for the best split: + - If int, then consider `max_features` features at each split. + - If float, then `max_features` is a fraction and + `round(max_features * n_features)` features are considered at each + split. + - If "auto", then `max_features=sqrt(n_features)`. + - If "sqrt", then `max_features=sqrt(n_features)`. + - If "log2", then `max_features=log2(n_features)`. + - If None, then `max_features=n_features`. + + Note: the search for a split does not stop until at least one + valid partition of the node samples is found, even if it requires to + effectively inspect more than ``max_features`` features. + + poisson_sampler : boolean, default=False + To match the GRF theory [#1grf]_, if True, the number of features + considered at each tree are drawn from a poisson distribution with + mean equal to `max_features`. + + fit_kwargs : dict, default={} + Named arguments passed to the sklearn.tree.DecisionTreeClassifier tree + created during `fit`. Attributes ---------- - transformer : sklearn.tree.DecisionTreeClassifier - an internal sklearn DecisionTreeClassifier + transformer_ : sklearn.tree.DecisionTreeClassifier + an internal sklearn.tree.DecisionTreeClassifier. + + n_features_ : int + The number of features of the data fitted. + + References + ---------- + .. [#1grf] Athey, Susan, Julie Tibshirani and Stefan Wager. + "Generalized Random Forests", Annals of Statistics, 2019. 
""" - def __init__(self, kwargs={}): - self.kwargs = kwargs + def __init__(self, max_features=1.0, poisson_sampler=False, fit_kwargs={}): + self.max_features = max_features + self.poisson_sampler = poisson_sampler + self.fit_kwargs = fit_kwargs def fit(self, X, y): """ @@ -162,7 +194,33 @@ def fit(self, X, y): The object itself. """ X, y = check_X_y(X, y) - self.transformer_ = DecisionTreeClassifier(**self.kwargs).fit(X, y) + self.n_features_ = X.shape[1] + if self.poisson_sampler: + if self.max_features in ("auto", "sqrt"): + max_features = np.sqrt(self.n_features_) + elif self.max_features == "log2": + max_features = np.log2(self.n_features_) + elif isinstance(self.max_features, float): + assert self.max_features > 0, self.max_features + max_features = self.max_features * self.n_features_ + elif isinstance(self.max_features, int): + assert self.max_features > 0, self.max_features + max_features = self.max_features + else: + raise ValueError(f"max_features value not an accepted value") + if max_features > self.n_features_: + warnings.warn( + "max_features value led to poisson mean " + + "({max_features}) > the number of features" + ) + max_features = int(max_features) + max_features = min(max(np.random.poisson(max_features), 1), self.n_features_) + else: + max_features = self.max_features + + self.transformer_ = DecisionTreeClassifier( + max_features=max_features, **self.fit_kwargs + ).fit(X, y) return self def transform(self, X): From 3d2dfdb3ca073f62503b359bb7a40f62d8da6e22 Mon Sep 17 00:00:00 2001 From: rflperry Date: Mon, 8 Mar 2021 22:42:59 -0500 Subject: [PATCH 02/10] update test --- proglearn/tests/test_forest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/proglearn/tests/test_forest.py b/proglearn/tests/test_forest.py index 17ba218673..e543ae695a 100644 --- a/proglearn/tests/test_forest.py +++ b/proglearn/tests/test_forest.py @@ -105,6 +105,7 @@ def test_max_samples(): assert all(np.diff(depths) > 0) + # @pytest.mark.parametrize("signal_ranks", 
[None, 2]) def test_uf_params(): pass From 4dc355b13804253af99261e7584f7ee49e2a7430 Mon Sep 17 00:00:00 2001 From: rflperry Date: Wed, 10 Mar 2021 13:06:11 -0500 Subject: [PATCH 03/10] corrected parallel --- proglearn/forest.py | 13 +++---- proglearn/progressive_learner.py | 61 +++++++++++++++++++++++--------- proglearn/tests/test_forest.py | 40 ++++++++++++++------- 3 files changed, 78 insertions(+), 36 deletions(-) diff --git a/proglearn/forest.py b/proglearn/forest.py index 32f50dbd2f..028e9fddea 100644 --- a/proglearn/forest.py +++ b/proglearn/forest.py @@ -317,16 +317,16 @@ class UncertaintyForest: valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features. - poisson_sampler : boolean, default=True + poisson_sampler : boolean, default=False To match the GRF theory [#1grf]_, if True, the number of features considered at each tree are drawn from a poisson distribution with mean equal to `max_features`. - n_jobs : int, default=1 + n_jobs : int, default=None The number of jobs to run in parallel. ``-1`` means use all - processors. + processors. None equates to 1. - max_samples : int or float, default=0.5 + max_samples : int or float, default=None The number of samples to draw from X (without replacement) to train each tree. - If None, then draw `X.shape[0]` samples. 
@@ -370,8 +370,8 @@ def __init__( max_depth=None, tree_construction_proportion=0.5, max_features="auto", - poisson_sampler=True, - max_samples=0.5, + poisson_sampler=False, + max_samples=None, n_jobs=None, tree_kwargs={}, ): @@ -409,6 +409,7 @@ def fit(self, X, y): default_kappa=self.kappa, default_max_depth=self.max_depth, default_tree_construction_proportion=self.tree_construction_proportion, + n_jobs=self.n_jobs, ) self.tree_kwargs_ = { diff --git a/proglearn/progressive_learner.py b/proglearn/progressive_learner.py index 44ddd3dd15..7ba9afcaf7 100755 --- a/proglearn/progressive_learner.py +++ b/proglearn/progressive_learner.py @@ -194,7 +194,8 @@ def get_task_ids(self): def _append_transformer(self, transformer_id, transformer): if transformer_id in self.get_transformer_ids(): - self.transformer_id_to_transformers[transformer_id].append(transformer) + self.transformer_id_to_transformers[transformer_id].append( + transformer) else: self.transformer_id_to_transformers[transformer_id] = [transformer] @@ -219,7 +220,8 @@ def _append_voter_data_idx(self, task_id, bag_id, voter_data_idx): if task_id in list(self.task_id_to_bag_id_to_voter_data_idx.keys()): self.task_id_to_bag_id_to_voter_data_idx[task_id][bag_id] = voter_data_idx else: - self.task_id_to_bag_id_to_voter_data_idx[task_id] = {bag_id: voter_data_idx} + self.task_id_to_bag_id_to_voter_data_idx[task_id] = { + bag_id: voter_data_idx} def _append_decider_idx(self, task_id, decider_idx): self.task_id_to_decider_idx[task_id] = decider_idx @@ -237,7 +239,8 @@ def _bifurcate_decider_idxs(self, ra, transformer_voter_decider_split): np.random.choice(ra, int(len(ra) * p), replace=False) for p in split ] else: - first_idx = np.random.choice(ra, int(len(ra) * split[0]), replace=False) + first_idx = np.random.choice( + ra, int(len(ra) * split[0]), replace=False) second_idx = np.random.choice( np.delete(ra, first_idx), int(len(ra) * split[1]), replace=False ) @@ -251,8 +254,11 @@ def set_transformer( 
transformer_data_idx=None, transformer_class=None, transformer_kwargs=None, + parallel=False, ): - + if transformer is not None and transformer.is_fitted() and parallel: + raise ValueError( + "Parallelization not implemented for fitted transformers") if transformer_id is None: transformer_id = len(self.get_transformer_ids()) @@ -270,7 +276,7 @@ def set_transformer( X, y = X[transformer_data_idx], y[transformer_data_idx] if X is None and y is None: - if transformer.is_fitted(): + if transformer is not None and transformer.is_fitted(): self._append_transformer(transformer_id, transformer) else: raise ValueError( @@ -299,13 +305,19 @@ def set_transformer( # Fit transformer and new voter if y is None: + transformer = transformer_class(**transformer_kwargs).fit(X) + if parallel: + return transformer self._append_transformer( - transformer_id, transformer_class(**transformer_kwargs).fit(X) + transformer_id, transformer ) else: # Type check y + transformer = transformer_class(**transformer_kwargs).fit(X, y) + if parallel: + return transformer self._append_transformer( - transformer_id, transformer_class(**transformer_kwargs).fit(X, y) + transformer_id, transformer ) def set_voter( @@ -316,7 +328,7 @@ def set_voter( voter_kwargs=None, bag_id=None, ): - + # TODO parallelize, at least for trees # Type check X # Type check y @@ -359,7 +371,8 @@ def set_voter( if bag_id is None: transformers = self.transformer_id_to_transformers[transformer_id] else: - transformers = [self.transformer_id_to_transformers[transformer_id][bag_id]] + transformers = [ + self.transformer_id_to_transformers[transformer_id][bag_id]] for transformer_num, transformer in enumerate(transformers): if transformer_id == task_id: voter_data_idx = self.task_id_to_bag_id_to_voter_data_idx[task_id][ @@ -513,42 +526,55 @@ def add_transformer( else: n = None + # transformer helper function def _train_new_transformer(transformer_num): # train new transformers if n is not None: transformer_data_idx = 
np.random.choice( transformer_voter_data_idx, int(transformer_data_proportion * n), - replace=False, + replace=False, # No bootstrapping ) else: transformer_data_idx = None + self.set_transformer( transformer_id=transformer_id, transformer_data_idx=transformer_data_idx, transformer_class=transformer_class, transformer_kwargs=transformer_kwargs, ) - voter_data_idx = np.delete(transformer_voter_data_idx, transformer_data_idx) + voter_data_idx = np.setdiff1d( + transformer_voter_data_idx, transformer_data_idx) + self._append_voter_data_idx( task_id=transformer_id, bag_id=transformer_num, voter_data_idx=voter_data_idx, ) - _ = Parallel(n_jobs=self.n_jobs)( + # Parallel loop over transformer training + Parallel(n_jobs=self.n_jobs)( delayed(_train_new_transformer)(num) for num in range(num_transformers) ) - # train voters and deciders from new transformer to previous tasks - for existing_task_id in np.intersect1d(backward_task_ids, self.get_task_ids()): - self.set_voter(transformer_id=transformer_id, task_id=existing_task_id) + # Voter and decider helper function + def _train_voters_deciders(existing_task_id): + self.set_voter(transformer_id=transformer_id, + task_id=existing_task_id) self.set_decider( task_id=existing_task_id, transformer_ids=list( - self.task_id_to_transformer_id_to_voters[existing_task_id].keys() + self.task_id_to_transformer_id_to_voters[existing_task_id].keys( + ) ), ) + + # train voters and deciders from new transformer to previous tasks + Parallel(n_jobs=self.n_jobs)( + delayed(_train_voters_deciders) + (existing_task_id) for existing_task_id in np.intersect1d(backward_task_ids, self.get_task_ids()) + ) return self @@ -693,7 +719,8 @@ def add_task( if num_transformers == 0: transformer_ids = forward_transformer_ids else: - transformer_ids = np.concatenate([forward_transformer_ids, task_id]) + transformer_ids = np.concatenate( + [forward_transformer_ids, task_id]) else: transformer_ids = self.get_transformer_ids() self.set_decider( diff --git 
a/proglearn/tests/test_forest.py b/proglearn/tests/test_forest.py index e543ae695a..766a60aae4 100644 --- a/proglearn/tests/test_forest.py +++ b/proglearn/tests/test_forest.py @@ -1,12 +1,15 @@ -import unittest -import pytest -import numpy as np -import random - -from proglearn.forest import LifelongClassificationForest, UncertaintyForest -from proglearn.transformers import TreeClassificationTransformer -from proglearn.voters import TreeClassificationVoter from proglearn.deciders import SimpleArgmaxAverage +from proglearn.voters import TreeClassificationVoter +from proglearn.transformers import TreeClassificationTransformer +from proglearn.forest import LifelongClassificationForest, UncertaintyForest +import random +import numpy as np +import time +import pytest +import unittest +import pprint +import sys +pprint.pprint(sys.path) class TestLifelongClassificationForest: @@ -85,11 +88,22 @@ def test_decision_tree_params(max_depth, max_features, poisson_sampler): def test_parallel_trees(): - uf = UncertaintyForest(n_jobs=2) - X = np.random.normal(0, 1, (100, 2)) - X[:50] *= -1 - y = [0] * 50 + [1] * 50 - uf = uf.fit(X, y) + uf = UncertaintyForest(n_estimators=500, n_jobs=1, + max_features=1, tree_construction_proportion=0.99) + uf_parallel = UncertaintyForest( + n_estimators=500, n_jobs=2, max_features=1, tree_construction_proportion=0.99) + X = np.arange(1000)[:, None] + y = [0, 1] * (len(X) // 2) + + time_start = time.time() + uf.fit(X, y) + time_diff = time.time() - time_start + + time_start = time.time() + uf_parallel.fit(X, y) + time_parallel_diff = time.time() - time_start + + assert time_parallel_diff * 1.25 < time_diff def test_max_samples(): From a6bfccccd1ba2dedf9c2edccef25257b4caacdd6 Mon Sep 17 00:00:00 2001 From: rflperry Date: Wed, 10 Mar 2021 15:31:27 -0500 Subject: [PATCH 04/10] fixed broken parallelism --- proglearn/progressive_learner.py | 25 +++++++++++++++++++------ proglearn/tests/test_forest.py | 8 ++++---- 2 files changed, 23 insertions(+), 10 
deletions(-) diff --git a/proglearn/progressive_learner.py b/proglearn/progressive_learner.py index 7ba9afcaf7..1d9be3f6f0 100755 --- a/proglearn/progressive_learner.py +++ b/proglearn/progressive_learner.py @@ -538,30 +538,40 @@ def _train_new_transformer(transformer_num): else: transformer_data_idx = None - self.set_transformer( + transformer = self.set_transformer( transformer_id=transformer_id, transformer_data_idx=transformer_data_idx, transformer_class=transformer_class, transformer_kwargs=transformer_kwargs, + parallel=True, ) voter_data_idx = np.setdiff1d( transformer_voter_data_idx, transformer_data_idx) + return transformer_num, transformer_id, transformer, voter_data_idx + + # Parallel loop over transformer training + ensemble = Parallel(n_jobs=self.n_jobs)( + delayed(_train_new_transformer)(num) for num in range(num_transformers) + ) + + for transformer_num, transformer_id, transformer, voter_data_idx in ensemble: + self._append_transformer( + transformer_id, transformer + ) self._append_voter_data_idx( task_id=transformer_id, bag_id=transformer_num, voter_data_idx=voter_data_idx, ) - # Parallel loop over transformer training - Parallel(n_jobs=self.n_jobs)( - delayed(_train_new_transformer)(num) for num in range(num_transformers) - ) + for num in range(num_transformers): + _train_new_transformer(num) # Voter and decider helper function def _train_voters_deciders(existing_task_id): self.set_voter(transformer_id=transformer_id, - task_id=existing_task_id) + task_id=existing_task_id) self.set_decider( task_id=existing_task_id, transformer_ids=list( @@ -570,6 +580,9 @@ def _train_voters_deciders(existing_task_id): ), ) + # for existing_task_id in np.intersect1d(backward_task_ids, self.get_task_ids()): + # _train_voters_deciders(existing_task_id) + # train voters and deciders from new transformer to previous tasks Parallel(n_jobs=self.n_jobs)( delayed(_train_voters_deciders) diff --git a/proglearn/tests/test_forest.py b/proglearn/tests/test_forest.py index 
766a60aae4..648822d1f3 100644 --- a/proglearn/tests/test_forest.py +++ b/proglearn/tests/test_forest.py @@ -88,10 +88,10 @@ def test_decision_tree_params(max_depth, max_features, poisson_sampler): def test_parallel_trees(): - uf = UncertaintyForest(n_estimators=500, n_jobs=1, - max_features=1, tree_construction_proportion=0.99) + uf = UncertaintyForest(n_estimators=200, n_jobs=1, + max_features=1, tree_construction_proportion=0.999) uf_parallel = UncertaintyForest( - n_estimators=500, n_jobs=2, max_features=1, tree_construction_proportion=0.99) + n_estimators=200, n_jobs=2, max_features=1, tree_construction_proportion=0.999) X = np.arange(1000)[:, None] y = [0, 1] * (len(X) // 2) @@ -103,7 +103,7 @@ def test_parallel_trees(): uf_parallel.fit(X, y) time_parallel_diff = time.time() - time_start - assert time_parallel_diff * 1.25 < time_diff + assert time_parallel_diff / time_diff < 0.9 def test_max_samples(): From ef8d7a19c12ed36c64f0cf1151f481e8d97a1699 Mon Sep 17 00:00:00 2001 From: rflperry Date: Wed, 10 Mar 2021 21:20:58 -0500 Subject: [PATCH 05/10] optimized parallel --- proglearn/progressive_learner.py | 51 ++++++++++++++++++-------------- proglearn/tests/test_forest.py | 8 ++--- 2 files changed, 32 insertions(+), 27 deletions(-) diff --git a/proglearn/progressive_learner.py b/proglearn/progressive_learner.py index 1d9be3f6f0..c1ef604c24 100755 --- a/proglearn/progressive_learner.py +++ b/proglearn/progressive_learner.py @@ -373,7 +373,9 @@ def set_voter( else: transformers = [ self.transformer_id_to_transformers[transformer_id][bag_id]] - for transformer_num, transformer in enumerate(transformers): + + # for transformer_num, transformer in enumerate(transformers): + def _parallel_helper(transformer_num, transformer, parallel=True): if transformer_id == task_id: voter_data_idx = self.task_id_to_bag_id_to_voter_data_idx[task_id][ transformer_num @@ -382,13 +384,28 @@ def set_voter( voter_data_idx = np.delete( range(len(X)), self.task_id_to_decider_idx[task_id] ) 
- self._append_voter( - transformer_id, - task_id, - voter_class(**voter_kwargs).fit( - transformer.transform(X[voter_data_idx]), y[voter_data_idx] - ), - ) + voter = voter_class(**voter_kwargs).fit( + transformer.transform(X[voter_data_idx]), y[voter_data_idx]) + + if parallel: + return task_id, transformer_id, voter + else: + self._append_voter( + transformer_id, + task_id, + voter_class(**voter_kwargs).fit( + transformer.transform(X[voter_data_idx]), y[voter_data_idx] + ), + ) + + # Parallel loop over voter training + # TODO Remove or fix. Tests show this causes drastic slowdown when in parallel + voter_info = Parallel(n_jobs=1)( + delayed(_parallel_helper)(transformer_num, transformer) + for transformer_num, transformer in enumerate(transformers) + ) + for transformer_id, task_id, voter in voter_info: + self._append_voter(transformer_id, task_id, voter) self.task_id_to_voter_class[task_id] = voter_class self.task_id_to_voter_kwargs[task_id] = voter_kwargs @@ -565,13 +582,10 @@ def _train_new_transformer(transformer_num): voter_data_idx=voter_data_idx, ) - for num in range(num_transformers): - _train_new_transformer(num) - - # Voter and decider helper function - def _train_voters_deciders(existing_task_id): + # train voters and deciders from new transformer to previous tasks + for existing_task_id in np.intersect1d(backward_task_ids, self.get_task_ids()): self.set_voter(transformer_id=transformer_id, - task_id=existing_task_id) + task_id=existing_task_id) self.set_decider( task_id=existing_task_id, transformer_ids=list( @@ -579,15 +593,6 @@ def _train_voters_deciders(existing_task_id): ) ), ) - - # for existing_task_id in np.intersect1d(backward_task_ids, self.get_task_ids()): - # _train_voters_deciders(existing_task_id) - - # train voters and deciders from new transformer to previous tasks - Parallel(n_jobs=self.n_jobs)( - delayed(_train_voters_deciders) - (existing_task_id) for existing_task_id in np.intersect1d(backward_task_ids, self.get_task_ids()) - ) 
return self diff --git a/proglearn/tests/test_forest.py b/proglearn/tests/test_forest.py index 648822d1f3..1907e12457 100644 --- a/proglearn/tests/test_forest.py +++ b/proglearn/tests/test_forest.py @@ -88,11 +88,11 @@ def test_decision_tree_params(max_depth, max_features, poisson_sampler): def test_parallel_trees(): - uf = UncertaintyForest(n_estimators=200, n_jobs=1, - max_features=1, tree_construction_proportion=0.999) + uf = UncertaintyForest(n_estimators=100, n_jobs=1, + max_features=1.0, tree_construction_proportion=0.5) uf_parallel = UncertaintyForest( - n_estimators=200, n_jobs=2, max_features=1, tree_construction_proportion=0.999) - X = np.arange(1000)[:, None] + n_estimators=100, n_jobs=10, max_features=1.0, tree_construction_proportion=0.5) + X = np.random.normal(0, 1, (1000, 100)) y = [0, 1] * (len(X) // 2) time_start = time.time() From 9ef96d80886d2638613dbadaa69d8d0e438cfd07 Mon Sep 17 00:00:00 2001 From: rflperry Date: Thu, 18 Mar 2021 15:00:27 -0400 Subject: [PATCH 06/10] sample weight parallelization --- proglearn/deciders.py | 23 ++++++++++- proglearn/forest.py | 1 + proglearn/progressive_learner.py | 69 +++++++++++++++++++++++--------- proglearn/transformers.py | 10 ++++- proglearn/voters.py | 9 +++-- 5 files changed, 85 insertions(+), 27 deletions(-) diff --git a/proglearn/deciders.py b/proglearn/deciders.py index 3d5c3412bb..c2c8e4be4e 100755 --- a/proglearn/deciders.py +++ b/proglearn/deciders.py @@ -123,6 +123,7 @@ def predict_proba(self, X, transformer_ids=None): """ check_is_fitted(self) vote_per_transformer_id = [] + prior_posterior_per_id = [] for transformer_id in ( transformer_ids if transformer_ids is not None @@ -130,18 +131,36 @@ def predict_proba(self, X, transformer_ids=None): ): check_is_fitted(self) vote_per_bag_id = [] + prior_posterior_per_bag = [] for bag_id in range( len(self.transformer_id_to_transformers_[transformer_id]) ): transformer = self.transformer_id_to_transformers_[transformer_id][ bag_id ] + # X.shape = 
(n_samples, n_features) X_transformed = transformer.transform(X) + # X_transformed.shape = (n_samples,) voter = self.transformer_id_to_voters_[transformer_id][bag_id] vote = voter.predict_proba(X_transformed) + # vote.shape = (n_samples, n_classes) vote_per_bag_id.append(vote) - vote_per_transformer_id.append(np.mean(vote_per_bag_id, axis=0)) - return np.mean(vote_per_transformer_id, axis=0) + + prior_posterior_per_bag.append(transformer.prior_posterior_) + # Each sample gets the average over transformers. Exclude all zeros in the mean + # vote_per_bag_id.shape = (n_transformers, n_samples, n_classes) + transformer_vote = np.sum(vote_per_bag_id, axis=0) + num_transformers = np.sum(vote_per_bag_id, axis=2).sum(axis=0)[:, None] + vote_per_transformer_id.append(transformer_vote / num_transformers) + prior_posterior_per_id.append(np.mean(prior_posterior_per_bag)) + + # vote_per_transformer_id.shape = (1, n_samples, n_classes) + predicted_posteriors = np.mean(vote_per_transformer_id, axis=0) + # Correction for samples not predicted by any tree + unknown_sample_indices = np.where(np.sum(predicted_posteriors, axis=1) == 0)[0] + predicted_posteriors[unknown_sample_indices] = np.mean(prior_posterior_per_id) + + return predicted_posteriors def predict(self, X, transformer_ids=None): """ diff --git a/proglearn/forest.py b/proglearn/forest.py index 028e9fddea..054876f06f 100644 --- a/proglearn/forest.py +++ b/proglearn/forest.py @@ -9,6 +9,7 @@ import numpy as np +from sklearn.ensemble import RandomForestClassifier from sklearn.utils.validation import check_X_y, check_array diff --git a/proglearn/progressive_learner.py b/proglearn/progressive_learner.py index c1ef604c24..4ef5d4a5c9 100755 --- a/proglearn/progressive_learner.py +++ b/proglearn/progressive_learner.py @@ -1,10 +1,11 @@ """ -Main Author: Will LeVine +Main Author: Will LeVine Corresponding Email: levinewill@icloud.com """ import numpy as np from joblib import Parallel, delayed from .base import 
BaseClassificationProgressiveLearner, BaseProgressiveLearner +from .transformers import TreeClassificationTransformer class ProgressiveLearner(BaseProgressiveLearner): @@ -256,9 +257,55 @@ def set_transformer( transformer_kwargs=None, parallel=False, ): + + if transformer_class is None: + if self.default_transformer_class is None: + raise ValueError( + "transformer_class is None and 'default_transformer_class' is None." + ) + else: + transformer_class = self.default_transformer_class + + if transformer_kwargs is None: + if self.default_transformer_kwargs is None: + raise ValueError( + """transformer_kwargs is None and + 'default_transformer_kwargs' is None.""" + ) + else: + transformer_kwargs = self.default_transformer_kwargs + if transformer is not None and transformer.is_fitted() and parallel: raise ValueError( "Parallelization not implemented for fitted transformers") + elif parallel and ( + transformer_class == TreeClassificationTransformer + ): + # Possible solution to not recreate memory arrays, See + # sklearn/ensemble/_forest.py#L176 + n_samples = ( + self.transformer_id_to_X[transformer_id].shape[0] + if transformer_id in list(self.transformer_id_to_X.keys()) + else self.task_id_to_X[transformer_id].shape[0] + ) + sample_weight = np.ones((n_samples,), dtype=np.float64) + sample_counts = np.bincount( + transformer_data_idx, minlength=n_samples) + sample_weight *= sample_counts + transformer = transformer_class( + sample_weight=sample_weight, **transformer_kwargs).fit( + ( + self.transformer_id_to_X[transformer_id] + if transformer_id in list(self.transformer_id_to_X.keys()) + else self.task_id_to_X[transformer_id] + ), + ( + self.transformer_id_to_y[transformer_id] + if transformer_id in list(self.transformer_id_to_y.keys()) + else self.task_id_to_y[transformer_id] + )) + return transformer + if transformer_id is None: transformer_id = len(self.get_transformer_ids()) @@ -286,23 +333,6 @@ def set_transformer( # Type check X - if transformer_class is None: - 
if self.default_transformer_class is None: - raise ValueError( - "transformer_class is None and 'default_transformer_class' is None." - ) - else: - transformer_class = self.default_transformer_class - - if transformer_kwargs is None: - if self.default_transformer_kwargs is None: - raise ValueError( - """transformer_kwargs is None and - 'default_transformer_kwargs' is None.""" - ) - else: - transformer_kwargs = self.default_transformer_kwargs - # Fit transformer and new voter if y is None: transformer = transformer_class(**transformer_kwargs).fit(X) @@ -394,7 +424,8 @@ def _parallel_helper(transformer_num, transformer, parallel=True): transformer_id, task_id, voter_class(**voter_kwargs).fit( - transformer.transform(X[voter_data_idx]), y[voter_data_idx] + transformer.transform( + X[voter_data_idx]), y[voter_data_idx] ), ) diff --git a/proglearn/transformers.py b/proglearn/transformers.py index e647c1bfd7..e9f42f1ead 100644 --- a/proglearn/transformers.py +++ b/proglearn/transformers.py @@ -154,6 +154,10 @@ class TreeClassificationTransformer(BaseTransformer): considered at each tree are drawn from a poisson distribution with mean equal to `max_features`. + sample_weight : array-like of shape (n_samples,), default=None + If None, all samples weighted equally. Used for efficient parallelization + by making non-sampled X,y weighted with 0. + fit_kwargs : dict, default={} Named arguments passed to the sklearn.tree.DecisionTreeClassifier tree created during `fit`. @@ -172,9 +176,10 @@ class TreeClassificationTransformer(BaseTransformer): "Generalized Random Forests", Annals of Statistics, 2019. 
""" - def __init__(self, max_features=1.0, poisson_sampler=False, fit_kwargs={}): + def __init__(self, max_features=1.0, poisson_sampler=False, sample_weight=None, fit_kwargs={}): self.max_features = max_features self.poisson_sampler = poisson_sampler + self.sample_weight = sample_weight self.fit_kwargs = fit_kwargs def fit(self, X, y): @@ -195,6 +200,7 @@ def fit(self, X, y): """ X, y = check_X_y(X, y) self.n_features_ = X.shape[1] + self.prior_posterior_ = np.bincount(y) / len(y) if self.poisson_sampler: if self.max_features in ("auto", "sqrt"): max_features = np.sqrt(self.n_features_) @@ -220,7 +226,7 @@ def fit(self, X, y): self.transformer_ = DecisionTreeClassifier( max_features=max_features, **self.fit_kwargs - ).fit(X, y) + ).fit(X, y, sample_weight=self.sample_weight) return self def transform(self, X): diff --git a/proglearn/voters.py b/proglearn/voters.py index 838f4bfdf6..2615aad96c 100755 --- a/proglearn/voters.py +++ b/proglearn/voters.py @@ -49,8 +49,8 @@ def fit(self, X, y): Parameters ---------- - X : array of shape [n_samples, n_features] - the transformed input data + X : array of shape [n_samples,] + Leaf indices each sample falls into y : array of shape [n_samples] the class labels @@ -70,6 +70,7 @@ def fit(self, X, y): self.missing_label_indices_.append(idx) self.uniform_posterior_ = np.ones(num_fit_classes) / num_fit_classes + # self.prior_posterior_ = np.bincount(y, minlength=len(self.classes)) / len(y) self.leaf_to_posterior_ = {} @@ -92,8 +93,8 @@ def predict_proba(self, X): Parameters ---------- - X : array of shape [n_samples, n_features] - the transformed input data + X : array of shape [n_samples,] + Indices of the leaf each sample falls into Returns ------- From d02308f34256b4d423ef769d649d50a21e582b18 Mon Sep 17 00:00:00 2001 From: rflperry Date: Thu, 18 Mar 2021 16:59:02 -0400 Subject: [PATCH 07/10] prior posteriors --- proglearn/deciders.py | 2 +- proglearn/forest.py | 70 ++++++++++++++++++++------------ 
proglearn/progressive_learner.py | 10 ++++- proglearn/voters.py | 39 ++++++++++++++---- 4 files changed, 85 insertions(+), 36 deletions(-) diff --git a/proglearn/deciders.py b/proglearn/deciders.py index c2c8e4be4e..48cd5143f5 100755 --- a/proglearn/deciders.py +++ b/proglearn/deciders.py @@ -89,6 +89,7 @@ def fit( self.classes = np.array(self.classes) self.transformer_id_to_transformers_ = transformer_id_to_transformers self.transformer_id_to_voters_ = transformer_id_to_voters + return self def predict_proba(self, X, transformer_ids=None): @@ -115,7 +116,6 @@ def predict_proba(self, X, transformer_ids=None): y_proba_hat : ndarray of shape [n_samples, n_classes] posteriors per example - Raises ------ NotFittedError diff --git a/proglearn/forest.py b/proglearn/forest.py index 054876f06f..4b176d8f62 100644 --- a/proglearn/forest.py +++ b/proglearn/forest.py @@ -51,6 +51,15 @@ class LifelongClassificationForest(ClassificationProgressiveLearner): Note: The number of samples used to learn the tree will be further reduced per the `tree_construction_proportion` value. + honest_prior : {"ignore", "uniform", "empirical"}, default="ignore" + Method for dealing with empty leaves during evaluation of a test + sample. If "ignore", trees in which the leaf is empty are not used in + the prediction. If "uniform", the prior tree posterior is 1/(number of + classes). If "empirical", the prior tree posterior is the relative + class frequency in the voting subsample. If all tree leaves are empty, + "ignore" will use the empirical prior and the others will use their + respective priors. 
+ Attributes ---------- pl_ : ClassificationProgressiveLearner @@ -66,6 +75,7 @@ def __init__( default_max_depth=30, max_samples=None, n_jobs=None, + honest_prior="ignore", ): self.default_n_estimators = default_n_estimators self.default_tree_construction_proportion = default_tree_construction_proportion @@ -73,6 +83,7 @@ def __init__( self.default_max_depth = default_max_depth self.max_samples = max_samples self.n_jobs = n_jobs + self.honest_prior = honest_prior self.pl_ = ClassificationProgressiveLearner( default_transformer_class=TreeClassificationTransformer, @@ -94,7 +105,6 @@ def add_task( kappa="default", max_depth="default", transformer_kwargs={}, - max_samples=1.0, ): """ adds a task with id task_id, max tree depth max_depth, given input data matrix X @@ -134,21 +144,6 @@ def add_task( transformer_kwargs : dict, default={} Additional named arguments to be passed to the transformer. - n_jobs : int, default=1 - The number of jobs to run in parallel. ``-1`` means use all - processors. - - max_samples : int or float, default=0.5 - The number of samples to draw from X (without replacement) to train - each tree. - - If None, then draw `X.shape[0]` samples. - - If int, then draw `max_samples` samples. - - If float, then draw `max_samples * X.shape[0]` samples. Thus, - `max_samples` should be in the interval `(0, 1)`. - - Note: The number of samples used to learn the tree will be further - reduced per the `tree_construction_proportion` value. 
- Returns ------- self : LifelongClassificationForest @@ -170,11 +165,13 @@ def add_task( transformer_kwargs["fit_kwargs"]["max_depth"] = max_depth X, y = check_X_y(X, y) - if isinstance(max_samples, int): - assert max_samples > 1 - max_samples = min(1, max_samples / X.shape[0]) - elif max_samples is None: + if isinstance(self.max_samples, int): + assert self.max_samples > 1 + max_samples = min(1, self.max_samples / X.shape[0]) + elif self.max_samples is None: max_samples = 1.0 + else: + max_samples = self.max_samples return self.pl_.add_task( X, y, @@ -187,8 +184,9 @@ def add_task( num_transformers=n_estimators, transformer_kwargs=transformer_kwargs, voter_kwargs={ - "classes": np.unique(y), - "kappa": kappa, + "classes" : np.unique(y), + "kappa" : kappa, + "honest_prior" : self.honest_prior }, decider_kwargs={"classes": np.unique(y)}, ) @@ -293,8 +291,8 @@ class UncertaintyForest: The number of trees in the UncertaintyForest kappa : float, default=np.inf - The coefficient for finite sample correction. - If set to the default value, finite sample correction is not performed. + The coefficient for finite sample correction. If set to the default + value, finite sample correction is not performed. max_depth : int, default=None The maximum depth of a tree in the UncertaintyForest. @@ -338,6 +336,15 @@ class UncertaintyForest: Note: The number of samples used to learn the tree will be further reduced per the `tree_construction_proportion` value. + honest_prior : {"ignore", "uniform", "empirical"}, default="ignore" + Method for dealing with empty leaves during evaluation of a test + sample. If "ignore", trees in which the leaf is empty are not used in + the prediction. If "uniform", the prior tree posterior is 1/(number of + classes). If "empirical", the prior tree posterior is the relative + class frequency in the voting subsample. If all tree leaves are empty, + "ignore" will use the empirical prior and the others will use their + respective priors. 
+ tree_kwargs : dict, default={} Named arguments to be passed to each sklearn.tree.DecisionTreeClassifier tree used in the construction @@ -348,6 +355,10 @@ class UncertaintyForest: estimators_ : list of sklearn.tree.DecisionTreeClassifier The collection of fitted trees. + voters_ : list of proglearn.voter.TreeClassificationVoter + The collection of honest voters for leaves in matching trees in + `self.estimators_` at the same index. + lf_ : LifelongClassificationForest Internal LifelongClassificationForest used to train and make inference. @@ -374,6 +385,7 @@ def __init__( poisson_sampler=False, max_samples=None, n_jobs=None, + honest_prior="ignore", tree_kwargs={}, ): self.n_estimators = n_estimators @@ -385,6 +397,7 @@ def __init__( self.max_samples = max_samples self.n_jobs = n_jobs self.tree_kwargs = tree_kwargs + self.honest_prior = honest_prior def fit(self, X, y): """ @@ -410,7 +423,9 @@ def fit(self, X, y): default_kappa=self.kappa, default_max_depth=self.max_depth, default_tree_construction_proportion=self.tree_construction_proportion, + max_samples=self.max_samples, n_jobs=self.n_jobs, + honest_prior=self.honest_prior, ) self.tree_kwargs_ = { @@ -423,7 +438,6 @@ def fit(self, X, y): y, task_id=0, transformer_kwargs=self.tree_kwargs_, - max_samples=self.max_samples, ) return self @@ -434,6 +448,12 @@ def estimators_(self): raise AttributeError("Model has not been fitted. Please fit first.") return [t.transformer_ for t in self.lf_.pl_.transformer_id_to_transformers[0]] + @property + def voters_(self): + if not hasattr(self, "lf_"): + raise AttributeError("Model has not been fitted. Please fit first.") + return [t for t in self.lf_.pl_.task_id_to_transformer_id_to_voters[0][0]] + def predict_proba(self, X): """ estimates class posteriors for each example in input data X. 
diff --git a/proglearn/progressive_learner.py b/proglearn/progressive_learner.py index 4ef5d4a5c9..f9aab814ae 100755 --- a/proglearn/progressive_learner.py +++ b/proglearn/progressive_learner.py @@ -43,6 +43,14 @@ class ProgressiveLearner(BaseProgressiveLearner): to the given string kwarg. This determines to which type of decider the progressive learner defaults if None is provided in any of the functions which add or set deciders. + honest_prior : {"ignore", "uniform", "empirical"}, default="ignore" + Method for dealing with empty partitions during evaluation of a test + sample. If "ignore", partitions in which the leaf is empty are not used in + the prediction. If "uniform", the prior posterior is 1/(number of + classes). If "empirical", the prior posterior is the relative + class frequency in the voting subsample. If all posteriors are empty, + "ignore" will use the empirical prior and the others will use their + respective priors. Attributes ---------- @@ -73,7 +81,7 @@ class ProgressiveLearner(BaseProgressiveLearner): and values of type obj corresponding to a transformer. This dictionary thus maps transformer ids to the corresponding transformers. - task_id_to_trasnformer_id_to_voters : dict + task_id_to_transformer_id_to_voters : dict A nested dictionary with outer key of type obj, corresponding to task ids inner key of type obj, corresponding to transformer ids, and values of type obj, corresponding to a voter. This dictionary thus maps diff --git a/proglearn/voters.py b/proglearn/voters.py index 2615aad96c..04dcb24c9f 100755 --- a/proglearn/voters.py +++ b/proglearn/voters.py @@ -27,21 +27,34 @@ class TreeClassificationVoter(BaseClassificationVoter): classes : list, default=[] list of all possible output label values + honest_prior : {"ignore", "uniform", "empirical"}, default="ignore" + Method for dealing with empty leaves during evaluation of a test + sample. If "ignore", trees in which the leaf is empty are not used in + the prediction.
If "uniform", the prior tree posterior is 1/(number of + classes). If "empirical", the prior tree posterior is the relative + class frequency in the voting subsample. If all tree leaves are empty, + "ignore" will use the empirical prior and the others will use their + respective priors. + Attributes ---------- missing_label_indices_ : list a (potentially empty) list of label values that exist in the ``classes`` parameter but are missing in the latest ``fit`` function - call + call. + + prior_posterior_ : ndarray of shape (n_classes,) + The prior posterior associated with zero posteriors. - uniform_posterior_ : ndarray of shape (n_classes,) - the uniform posterior associated with the + num_fit_classes_ : int + Number of unique classes in the set of fitted labels. """ - def __init__(self, kappa=np.inf, classes=[]): + def __init__(self, kappa=np.inf, classes=[], honest_prior="ignore"): self.kappa = kappa self.classes = np.asarray(classes) + self.honest_prior = honest_prior def fit(self, X, y): """ @@ -61,16 +74,22 @@ def fit(self, X, y): """ check_classification_targets(y) - num_fit_classes = len(np.unique(y)) + self.num_fit_classes_ = len(np.unique(y)) self.missing_label_indices_ = [] - if self.classes.size != 0 and num_fit_classes < len(self.classes): + if self.classes.size != 0 and self.num_fit_classes_ < len(self.classes): for idx, label in enumerate(self.classes): if label not in np.unique(y): self.missing_label_indices_.append(idx) - self.uniform_posterior_ = np.ones(num_fit_classes) / num_fit_classes - # self.prior_posterior_ = np.bincount(y, minlength=len(self.classes)) / len(y) + if self.honest_prior == "uniform": + self.prior_posterior_ = np.ones(self.num_fit_classes_) / self.num_fit_classes_ + elif self.honest_prior in ("empirical", "ignore"): + self.prior_posterior_ = np.bincount( + y, minlength=len(self.classes)) / len(y) + else: + raise ValueError("honest_prior must be in " + + "{'ignore', 'uniform', 'empirical'}") self.leaf_to_posterior_ = {} @@ -111,8 
+130,10 @@ def predict_proba(self, X): for x in X: if x in list(self.leaf_to_posterior_.keys()): votes_per_example.append(self.leaf_to_posterior_[x]) + elif self.honest_prior == "ignore": + votes_per_example.append(np.zeros(self.num_fit_classes_)) else: - votes_per_example.append(self.uniform_posterior_) + votes_per_example.append(self.prior_posterior_) votes_per_example = np.array(votes_per_example) From 3feeda586a377ed4750277fe6a5615ce93309415 Mon Sep 17 00:00:00 2001 From: rflperry Date: Thu, 18 Mar 2021 18:23:18 -0400 Subject: [PATCH 08/10] test honest prior --- proglearn/tests/test_forest.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/proglearn/tests/test_forest.py b/proglearn/tests/test_forest.py index 1907e12457..00598bf456 100644 --- a/proglearn/tests/test_forest.py +++ b/proglearn/tests/test_forest.py @@ -119,6 +119,17 @@ def test_max_samples(): assert all(np.diff(depths) > 0) +@pytest.mark.parametrize("honest_prior", ["empirical", "uniform", "ignore"]) +def test_honest_prior(honest_prior): + X = np.random.normal(0, 1, (60, 2)) + X[:30] *= -1 + y = [0, 0, 1] * 20 + uf = UncertaintyForest(n_estimators=5, honest_prior=honest_prior) + uf = uf.fit(X, y) + if honest_prior == 'uniform': + assert all([len(set(voter.prior_posterior_)) == 1 for voter in uf.voters_]) + elif honest_prior in ('ignore', 'empirical'): + assert all([np.diff(voter.prior_posterior_) < 0 for voter in uf.voters_]) # @pytest.mark.parametrize("signal_ranks", [None, 2]) def test_uf_params(): From d2e6aeec147f90d280a7f77056620c9018e1fa24 Mon Sep 17 00:00:00 2001 From: rflperry Date: Wed, 24 Mar 2021 12:04:31 -0400 Subject: [PATCH 09/10] fixed ignore param, 0.63 samples --- proglearn/deciders.py | 10 ++++++---- proglearn/forest.py | 5 +++-- proglearn/tests/test_forest.py | 17 ++++++++++++++--- 3 files changed, 23 insertions(+), 9 deletions(-) diff --git a/proglearn/deciders.py b/proglearn/deciders.py index 48cd5143f5..0f4f13c7c2 100755 --- a/proglearn/deciders.py +++ 
b/proglearn/deciders.py @@ -146,19 +146,21 @@ def predict_proba(self, X, transformer_ids=None): # vote.shape = (n_samples, n_classes) vote_per_bag_id.append(vote) - prior_posterior_per_bag.append(transformer.prior_posterior_) + prior_posterior_per_bag.append(voter.prior_posterior_) # Each sample gets the average over transformers. Exclude all zeros in the mean # vote_per_bag_id.shape = (n_transformers, n_samples, n_classes) transformer_vote = np.sum(vote_per_bag_id, axis=0) num_transformers = np.sum(vote_per_bag_id, axis=2).sum(axis=0)[:, None] - vote_per_transformer_id.append(transformer_vote / num_transformers) - prior_posterior_per_id.append(np.mean(prior_posterior_per_bag)) + vote_per_transformer_id.append(np.divide( + transformer_vote, num_transformers, out=np.zeros_like(transformer_vote), where=num_transformers!=0)) + + prior_posterior_per_id.append(np.mean(prior_posterior_per_bag, axis=0)) # vote_per_transformer_id.shape = (1, n_samples, n_classes) predicted_posteriors = np.mean(vote_per_transformer_id, axis=0) # Correction for samples not predicted by any tree unknown_sample_indices = np.where(np.sum(predicted_posteriors, axis=1) == 0)[0] - predicted_posteriors[unknown_sample_indices] = np.mean(prior_posterior_per_id) + predicted_posteriors[unknown_sample_indices] = np.mean(prior_posterior_per_id, axis=0) return predicted_posteriors diff --git a/proglearn/forest.py b/proglearn/forest.py index 4b176d8f62..75db56cd77 100644 --- a/proglearn/forest.py +++ b/proglearn/forest.py @@ -259,7 +259,8 @@ def predict_proba(self, X, task_id): y_proba_hat : ndarray of shape [n_samples, n_classes] posteriors per example """ - return self.pl_.predict_proba(check_array(X), task_id) + X = check_array(X) + return self.pl_.predict_proba(X, task_id) def predict(self, X, task_id): """ @@ -380,7 +381,7 @@ def __init__( n_estimators=100, kappa=np.inf, max_depth=None, - tree_construction_proportion=0.5, + tree_construction_proportion=0.63, max_features="auto", poisson_sampler=False, 
max_samples=None, diff --git a/proglearn/tests/test_forest.py b/proglearn/tests/test_forest.py index 00598bf456..7030435e40 100644 --- a/proglearn/tests/test_forest.py +++ b/proglearn/tests/test_forest.py @@ -131,6 +131,17 @@ def test_honest_prior(honest_prior): elif honest_prior in ('ignore', 'empirical'): assert all([np.diff(voter.prior_posterior_) < 0 for voter in uf.voters_]) -# @pytest.mark.parametrize("signal_ranks", [None, 2]) -def test_uf_params(): - pass + +@pytest.mark.parametrize("honest_prior", ["empirical", "uniform", "ignore"]) +def test_empty_leaves(honest_prior): + np.random.seed(0) + X = np.random.normal(0, 1, (100, 2)) + y = [0]*75 + [1]*25 + uf = UncertaintyForest(n_estimators=1, honest_prior=honest_prior, tree_construction_proportion=0.96, kappa=np.inf) + uf = uf.fit(X, y) + + y_proba = uf.predict_proba(X) + if honest_prior == 'uniform': + assert len(np.where(y_proba[:, 0] == 0.5)[0]) > 50 + elif honest_prior in ('ignore', 'empirical'): + assert len(np.where(y_proba[:, 0] == 0.75)[0]) > 50 From 2636515491be8a4834117bcf70444f8207ffebf0 Mon Sep 17 00:00:00 2001 From: rflperry Date: Fri, 26 Mar 2021 19:53:25 -0400 Subject: [PATCH 10/10] finite sample prior posterior --- proglearn/voters.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/proglearn/voters.py b/proglearn/voters.py index 04dcb24c9f..6908378640 100755 --- a/proglearn/voters.py +++ b/proglearn/voters.py @@ -184,10 +184,12 @@ def _finite_sample_correction(self, posteriors, num_points_in_partition, kappa): y_proba_hat : ndarray of shape [n_samples, n_classes] posteriors per example """ - correction_constant = 1 / (kappa * num_points_in_partition) + # correction_constant = 1 / (kappa * num_points_in_partition) - zero_posterior_idxs = np.where(posteriors == 0)[0] - posteriors[zero_posterior_idxs] = correction_constant + # zero_posterior_idxs = np.where(posteriors == 0)[0] + # posteriors[zero_posterior_idxs] = correction_constant + + posteriors += 
self.prior_posterior_ / (kappa * num_points_in_partition) posteriors /= sum(posteriors)