diff --git a/proglearn/deciders.py b/proglearn/deciders.py index 3d5c3412bb..0f4f13c7c2 100755 --- a/proglearn/deciders.py +++ b/proglearn/deciders.py @@ -89,6 +89,7 @@ def fit( self.classes = np.array(self.classes) self.transformer_id_to_transformers_ = transformer_id_to_transformers self.transformer_id_to_voters_ = transformer_id_to_voters + return self def predict_proba(self, X, transformer_ids=None): @@ -115,7 +116,6 @@ def predict_proba(self, X, transformer_ids=None): y_proba_hat : ndarray of shape [n_samples, n_classes] posteriors per example - Raises ------ NotFittedError @@ -123,6 +123,7 @@ def predict_proba(self, X, transformer_ids=None): """ check_is_fitted(self) vote_per_transformer_id = [] + prior_posterior_per_id = [] for transformer_id in ( transformer_ids if transformer_ids is not None @@ -130,18 +131,38 @@ ): check_is_fitted(self) vote_per_bag_id = [] + prior_posterior_per_bag = [] for bag_id in range( len(self.transformer_id_to_transformers_[transformer_id]) ): transformer = self.transformer_id_to_transformers_[transformer_id][ bag_id ] + # X.shape = (n_samples, n_features) X_transformed = transformer.transform(X) + # X_transformed.shape = (n_samples,), one leaf index per sample voter = self.transformer_id_to_voters_[transformer_id][bag_id] vote = voter.predict_proba(X_transformed) + # vote.shape = (n_samples, n_classes) vote_per_bag_id.append(vote) - vote_per_transformer_id.append(np.mean(vote_per_bag_id, axis=0)) - return np.mean(vote_per_transformer_id, axis=0) + + prior_posterior_per_bag.append(voter.prior_posterior_) + # Each sample gets the average over trees; all-zero ("ignored") votes are excluded from the mean + # vote_per_bag_id.shape = (n_estimators, n_samples, n_classes) + transformer_vote = np.sum(vote_per_bag_id, axis=0) + # Each retained vote sums to 1 over classes, so this counts, per sample, the trees that voted + num_transformers = np.sum(vote_per_bag_id, axis=2).sum(axis=0)[:, None] + vote_per_transformer_id.append(np.divide( + transformer_vote, num_transformers, out=np.zeros_like(transformer_vote), where=num_transformers != 0)) + + prior_posterior_per_id.append(np.mean(prior_posterior_per_bag, axis=0)) + + # vote_per_transformer_id.shape = (n_transformer_ids, n_samples, n_classes) + predicted_posteriors = np.mean(vote_per_transformer_id, axis=0) + # Correction for samples not predicted by any tree + unknown_sample_indices = np.where(np.sum(predicted_posteriors, axis=1) == 0)[0] + predicted_posteriors[unknown_sample_indices] = np.mean(prior_posterior_per_id, axis=0) + + return predicted_posteriors def predict(self, X, transformer_ids=None): """ diff --git a/proglearn/forest.py b/proglearn/forest.py index 80bac5ea3b..75db56cd77 100644 --- a/proglearn/forest.py +++ b/proglearn/forest.py @@ -9,6 +9,7 @@ import numpy as np +from sklearn.ensemble import RandomForestClassifier from sklearn.utils.validation import check_X_y, check_array @@ -35,6 +36,30 @@ class LifelongClassificationForest(ClassificationProgressiveLearner): The maximum depth of a tree in the Lifelong Classification Forest. This is used if 'max_depth' is not fed to add_task. + n_jobs : int, default=None + The number of jobs to run in parallel. ``-1`` means use all + processors; ``None`` means 1. + + max_samples : int or float, default=None + The number of samples to draw from X (without replacement) to train + each tree. + - If None, then draw `X.shape[0]` samples. + - If int, then draw `max_samples` samples. + - If float, then draw `max_samples * X.shape[0]` samples. Thus, + `max_samples` should be in the interval `(0, 1)`.
+ + Note: The number of samples used to learn the tree will be further + reduced per the `tree_construction_proportion` value. + + honest_prior : {"ignore", "uniform", "empirical"}, default="ignore" + Method for dealing with empty leaves during evaluation of a test + sample. If "ignore", trees in which the leaf is empty are not used in + the prediction. If "uniform", the prior tree posterior is 1/(number of + classes). If "empirical", the prior tree posterior is the relative + class frequency in the voting subsample. If all tree leaves are empty, + "ignore" will use the empirical prior and the others will use their + respective priors. + Attributes ---------- pl_ : ClassificationProgressiveLearner @@ -48,11 +73,17 @@ def __init__( default_tree_construction_proportion=0.67, default_kappa=np.inf, default_max_depth=30, + max_samples=None, + n_jobs=None, + honest_prior="ignore", ): self.default_n_estimators = default_n_estimators self.default_tree_construction_proportion = default_tree_construction_proportion self.default_kappa = default_kappa self.default_max_depth = default_max_depth + self.max_samples = max_samples + self.n_jobs = n_jobs + self.honest_prior = honest_prior self.pl_ = ClassificationProgressiveLearner( default_transformer_class=TreeClassificationTransformer, @@ -61,6 +92,7 @@ def __init__( default_voter_kwargs={"kappa": default_kappa}, default_decider_class=SimpleArgmaxAverage, default_decider_kwargs={}, + n_jobs=n_jobs, ) def add_task( @@ -72,6 +104,7 @@ def add_task( tree_construction_proportion="default", kappa="default", max_depth="default", + transformer_kwargs={}, ): """ adds a task with id task_id, max tree depth max_depth, given input data matrix X @@ -103,15 +136,20 @@ def add_task( The coefficient for finite sample correction. The default is used if 'default' is provided. + TODO prune max_depth into transformer_kwargs max_depth : int or str, default='default' The maximum depth of a tree in the Lifelong Classification Forest. The default is used if 'default' is provided. + transformer_kwargs : dict, default={} + Additional named arguments to be passed to the transformer. + Returns ------- self : LifelongClassificationForest The object itself. 
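+
+        Examples
+        --------
+        A minimal usage sketch; the data and ``task_id`` below are
+        illustrative, not taken from the package::
+
+            X = np.concatenate((np.zeros((10, 2)), np.ones((10, 2))))
+            y = [0] * 10 + [1] * 10
+            l2f = LifelongClassificationForest(default_n_estimators=10)
+            l2f.add_task(X, y, task_id=0)
+            y_hat = l2f.predict(X, task_id=0)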
""" + # TODO get rid of defaults in favor of None if n_estimators == "default": n_estimators = self.default_n_estimators if tree_construction_proportion == "default": @@ -121,21 +159,34 @@ def add_task( if max_depth == "default": max_depth = self.default_max_depth + # TODO eliminate by subsuming max_depth + if not "fit_kwargs" in transformer_kwargs.keys(): + transformer_kwargs["fit_kwargs"] = {} + transformer_kwargs["fit_kwargs"]["max_depth"] = max_depth + X, y = check_X_y(X, y) + if isinstance(self.max_samples, int): + assert self.max_samples > 1 + max_samples = min(1, self.max_samples / X.shape[0]) + elif self.max_samples is None: + max_samples = 1.0 + else: + max_samples = self.max_samples return self.pl_.add_task( X, y, task_id=task_id, transformer_voter_decider_split=[ - tree_construction_proportion, - 1 - tree_construction_proportion, + tree_construction_proportion * max_samples, + (1 - tree_construction_proportion) * max_samples, 0, ], num_transformers=n_estimators, - transformer_kwargs={"kwargs": {"max_depth": max_depth}}, + transformer_kwargs=transformer_kwargs, voter_kwargs={ - "classes": np.unique(y), - "kappa": kappa, + "classes" : np.unique(y), + "kappa" : kappa, + "honest_prior" : self.honest_prior }, decider_kwargs={"classes": np.unique(y)}, ) @@ -208,7 +259,8 @@ def predict_proba(self, X, task_id): y_proba_hat : ndarray of shape [n_samples, n_classes] posteriors per example """ - return self.pl_.predict_proba(check_array(X), task_id) + X = check_array(X) + return self.pl_.predict_proba(X, task_id) def predict(self, X, task_id): """ @@ -232,7 +284,7 @@ def predict(self, X, task_id): class UncertaintyForest: """ - A class used to represent an uncertainty forest. + A class used to represent an Uncertainty Forest. Parameters ---------- @@ -240,34 +292,113 @@ class UncertaintyForest: The number of trees in the UncertaintyForest kappa : float, default=np.inf - The coefficient for finite sample correction. - If set to the default value, finite sample correction is not performed. + The coefficient for finite sample correction. If set to the default + value, finite sample correction is not performed. - max_depth : int, default=30 - The maximum depth of a tree in the UncertaintyForest + max_depth : int, default=None + The maximum depth of a tree in the UncertaintyForest. - tree_construction_proportion : float, default = 0.67 + tree_construction_proportion : float, default=0.5 The proportions of the input data set aside to train each decision tree. The remainder of the data is used to fill in voting posteriors. + max_features : {"auto", "sqrt", "log2"}, int or float, default="auto" + The number of features to consider when looking for the best split: + - If int, then consider `max_features` features at each split. + - If float, then `max_features` is a fraction and + `round(max_features * n_features)` features are considered at each + split. + - If "auto", then `max_features=sqrt(n_features)`. + - If "sqrt", then `max_features=sqrt(n_features)` (same as "auto"). + - If "log2", then `max_features=log2(n_features)`. + - If None, then `max_features=n_features`. + + Note: the search for a split does not stop until at least one + valid partition of the node samples is found, even if it requires to + effectively inspect more than ``max_features`` features. + + poisson_sampler : boolean, default=False + To match the GRF theory [#1grf]_, if True, the number of features + considered at each tree are drawn from a poisson distribution with + mean equal to `max_features`. 
+ + n_jobs : int, default=None + The number of jobs to run in parallel. ``-1`` means use all + processors. None equates to 1. + + max_samples : int or float, default=None + The number of samples to draw from X (without replacement) to train + each tree. + - If None, then draw `X.shape[0]` samples. + - If int, then draw `max_samples` samples. + - If float, then draw `max_samples * X.shape[0]` samples. Thus, + `max_samples` should be in the interval `(0, 1)`. + + Note: The number of samples used to learn the tree will be further + reduced per the `tree_construction_proportion` value. + + honest_prior : {"ignore", "uniform", "empirical"}, default="ignore" + Method for dealing with empty leaves during evaluation of a test + sample. If "ignore", trees in which the leaf is empty are not used in + the prediction. If "uniform", the prior tree posterior is 1/(number of + classes). If "empirical", the prior tree posterior is the relative + class frequency in the voting subsample. If all tree leaves are empty, + "ignore" will use the empirical prior and the others will use their + respective priors. + + tree_kwargs : dict, default={} + Named arguments to be passed to each + sklearn.tree.DecisionTreeClassifier tree used in the construction + of the forest in addition to the above parameters. + Attributes ---------- + estimators_ : list of sklearn.tree.DecisionTreeClassifier + The collection of fitted trees. + + voters_ : list of proglearn.voter.TreeClassificationVoter + The collection of honest voters for leaves in matching trees in + `self.estimators_` at the same index. + lf_ : LifelongClassificationForest Internal LifelongClassificationForest used to train and make inference. + + n_features_ : int + The number of features when `fit` is performed. + + tree_kwargs_ : dict + Full set of keyword arguments passed to the Forest transformer. + + References + ---------- + .. [#1grf] Athey, Susan, Julie Tibshirani and Stefan Wager. + "Generalized Random Forests", Annals of Statistics, 2019. """ def __init__( self, n_estimators=100, kappa=np.inf, - max_depth=30, - tree_construction_proportion=0.67, + max_depth=None, + tree_construction_proportion=0.63, + max_features="auto", + poisson_sampler=False, + max_samples=None, + n_jobs=None, + honest_prior="ignore", + tree_kwargs={}, ): self.n_estimators = n_estimators self.kappa = kappa self.max_depth = max_depth self.tree_construction_proportion = tree_construction_proportion + self.max_features = max_features + self.poisson_sampler = poisson_sampler + self.max_samples = max_samples + self.n_jobs = n_jobs + self.tree_kwargs = tree_kwargs + self.honest_prior = honest_prior def fit(self, X, y): """ @@ -286,15 +417,43 @@ def fit(self, X, y): self : UncertaintyForest The object itself. 
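+
+        Examples
+        --------
+        A minimal sketch mirroring the data used in the accuracy test
+        (values are illustrative)::
+
+            X = np.ones((20, 4))
+            X[10:] *= -1
+            y = [0] * 10 + [1] * 10
+            uf = UncertaintyForest(n_estimators=10).fit(X, y)
+            y_proba = uf.predict_proba(X)  # shape (20, 2)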
""" + X, y = check_X_y(X, y) + self.n_features_ = X.shape[1] self.lf_ = LifelongClassificationForest( default_n_estimators=self.n_estimators, default_kappa=self.kappa, default_max_depth=self.max_depth, default_tree_construction_proportion=self.tree_construction_proportion, + max_samples=self.max_samples, + n_jobs=self.n_jobs, + honest_prior=self.honest_prior, ) - X, y = check_X_y(X, y) - return self.lf_.add_task(X, y, task_id=0) + self.tree_kwargs_ = { + "fit_kwargs": self.tree_kwargs, + "max_features": self.max_features, + "poisson_sampler": self.poisson_sampler, + } + self.lf_.add_task( + X, + y, + task_id=0, + transformer_kwargs=self.tree_kwargs_, + ) + + return self + + @property + def estimators_(self): + if not hasattr(self, "lf_"): + raise AttributeError("Model has not been fitted. Please fit first.") + return [t.transformer_ for t in self.lf_.pl_.transformer_id_to_transformers[0]] + + @property + def voters_(self): + if not hasattr(self, "lf_"): + raise AttributeError("Model has not been fitted. Please fit first.") + return [t for t in self.lf_.pl_.task_id_to_transformer_id_to_voters[0][0]] def predict_proba(self, X): """ @@ -310,7 +469,10 @@ def predict_proba(self, X): y_proba_hat : ndarray of shape [n_samples, n_classes] posteriors per example """ - return self.lf_.predict_proba(check_array(X), 0) + X = check_array(X) + if not hasattr(self, "lf_"): + raise AttributeError("Model has not been fitted. Please fit first.") + return self.lf_.predict_proba(X, 0) def predict(self, X): """ @@ -326,4 +488,7 @@ def predict(self, X): y_hat : ndarray of shape [n_samples] predicted class label per example """ - return self.lf_.predict(check_array(X), 0) + X = check_array(X) + if not hasattr(self, "lf_"): + raise AttributeError("Model has not been fitted. Please fit first.") + return self.lf_.predict(X, 0) diff --git a/proglearn/progressive_learner.py b/proglearn/progressive_learner.py index 4bfecc7f9c..f9aab814ae 100755 --- a/proglearn/progressive_learner.py +++ b/proglearn/progressive_learner.py @@ -1,9 +1,11 @@ """ -Main Author: Will LeVine +Main Author: Will LeVine Corresponding Email: levinewill@icloud.com """ import numpy as np +from joblib import Parallel, delayed from .base import BaseClassificationProgressiveLearner, BaseProgressiveLearner +from .transformers import TreeClassificationTransformer class ProgressiveLearner(BaseProgressiveLearner): @@ -41,6 +43,14 @@ class ProgressiveLearner(BaseProgressiveLearner): to the given string kwarg. This determines to which type of decider the progressive learner defaults if None is provided in any of the functions which add or set deciders. + honest_prior : {"ignore", "uniform", "empirical"}, default="ignore" + Method for dealing with empty paritions during evaluation of a test + sample. If "ignore", paritions in which the leaf is empty are not used in + the prediction. If "uniform", the prior posterior is 1/(number of + classes). If "empirical", the prior posterior is the relative + class frequency in the voting subsample. If all posteriors are empty, + "ignore" will use the empirical prior and the others will use their + respective priors. Attributes ---------- @@ -71,7 +81,7 @@ class ProgressiveLearner(BaseProgressiveLearner): and values of type obj corresponding to a transformer. This dictionary thus maps transformer ids to the corresponding transformers. 
- task_id_to_trasnformer_id_to_voters : dict + task_id_to_transformer_id_to_voters : dict A nested dictionary with outer key of type obj, corresponding to task ids inner key of type obj, corresponding to transformer ids, and values of type obj, corresponding to a voter. This dictionary thus maps @@ -137,6 +147,10 @@ class ProgressiveLearner(BaseProgressiveLearner): default_decider_kwargs : dict Stores the default decider kwargs as specified by the parameter default_decider_kwargs. + + n_jobs : int, default=None + The number of jobs to run in parallel when adding multiple + transformers per task. ``-1`` means use all processors; ``None`` + means 1. """ def __init__( @@ -147,6 +161,7 @@ def __init__( default_voter_kwargs=None, default_decider_class=None, default_decider_kwargs=None, + n_jobs=None, ): ( @@ -178,6 +193,8 @@ def __init__( self.default_decider_class = default_decider_class self.default_decider_kwargs = default_decider_kwargs + self.n_jobs = n_jobs + def get_transformer_ids(self): return np.array(list(self.transformer_id_to_transformers.keys())) @@ -186,7 +203,8 @@ def get_task_ids(self): def _append_transformer(self, transformer_id, transformer): if transformer_id in self.get_transformer_ids(): - self.transformer_id_to_transformers[transformer_id].append(transformer) + self.transformer_id_to_transformers[transformer_id].append( + transformer) else: self.transformer_id_to_transformers[transformer_id] = [transformer] @@ -211,7 +229,8 @@ def _append_voter_data_idx(self, task_id, bag_id, voter_data_idx): if task_id in list(self.task_id_to_bag_id_to_voter_data_idx.keys()): self.task_id_to_bag_id_to_voter_data_idx[task_id][bag_id] = voter_data_idx else: - self.task_id_to_bag_id_to_voter_data_idx[task_id] = {bag_id: voter_data_idx} + self.task_id_to_bag_id_to_voter_data_idx[task_id] = { + bag_id: voter_data_idx} def _append_decider_idx(self, task_id, decider_idx): self.task_id_to_decider_idx[task_id] = decider_idx @@ -229,7 +248,8 @@ def _bifurcate_decider_idxs(self, ra, transformer_voter_decider_split): np.random.choice(ra, int(len(ra) * p), replace=False) for p in split ] else: - first_idx = np.random.choice(ra, int(len(ra) * split[0]), replace=False) + first_idx = np.random.choice( + ra, int(len(ra) * split[0]), replace=False) second_idx = np.random.choice( np.delete(ra, first_idx), int(len(ra) * split[1]), replace=False ) @@ -243,8 +263,57 @@ def set_transformer( transformer_data_idx=None, transformer_class=None, transformer_kwargs=None, + parallel=False, ): + if transformer_class is None: + if self.default_transformer_class is None: + raise ValueError( + "transformer_class is None and 'default_transformer_class' is None."
+ ) + else: + transformer_class = self.default_transformer_class + + if transformer_kwargs is None: + if self.default_transformer_kwargs is None: + raise ValueError( + """transformer_kwargs is None and + 'default_transformer_kwargs' is None.""" + ) + else: + transformer_kwargs = self.default_transformer_kwargs + + if transformer is not None and transformer.is_fitted() and parallel: + raise ValueError( + "Parallelization not implemented for fitted transformers") + elif parallel and ( + transformer_class == TreeClassificationTransformer + ): + # Possible solution to not recreate memory arrays, See + # sklearn/ensemble/_forest.py#L176 + n_samples = ( + self.transformer_id_to_X[transformer_id].shape[0] + if transformer_id in list(self.transformer_id_to_X.keys()) + else self.task_id_to_X[transformer_id].shape[0] + ) + sample_weight = np.ones((n_samples,), dtype=np.float64) + sample_counts = np.bincount( + transformer_data_idx, minlength=n_samples) + sample_weight *= sample_counts + transformer = transformer_class( + sample_weight=sample_weight, **transformer_kwargs).fit( + ( + self.transformer_id_to_X[transformer_id] + if transformer_id in list(self.transformer_id_to_X.keys()) + else self.task_id_to_X[transformer_id] + ), + ( + self.transformer_id_to_y[transformer_id] + if transformer_id in list(self.transformer_id_to_y.keys()) + else self.task_id_to_y[transformer_id] + )) + return transformer + if transformer_id is None: transformer_id = len(self.get_transformer_ids()) @@ -262,7 +331,7 @@ def set_transformer( X, y = X[transformer_data_idx], y[transformer_data_idx] if X is None and y is None: - if transformer.is_fitted(): + if transformer is not None and transformer.is_fitted(): self._append_transformer(transformer_id, transformer) else: raise ValueError( @@ -272,32 +341,21 @@ def set_transformer( # Type check X - if transformer_class is None: - if self.default_transformer_class is None: - raise ValueError( - "transformer_class is None and 'default_transformer_class' is None." 
- ) - else: - transformer_class = self.default_transformer_class - - if transformer_kwargs is None: - if self.default_transformer_kwargs is None: - raise ValueError( - """transformer_kwargs is None and - 'default_transformer_kwargs' is None.""" - ) - else: - transformer_kwargs = self.default_transformer_kwargs - # Fit transformer and new voter if y is None: + transformer = transformer_class(**transformer_kwargs).fit(X) + if parallel: + return transformer self._append_transformer( - transformer_id, transformer_class(**transformer_kwargs).fit(X) + transformer_id, transformer ) else: # Type check y + transformer = transformer_class(**transformer_kwargs).fit(X, y) + if parallel: + return transformer self._append_transformer( - transformer_id, transformer_class(**transformer_kwargs).fit(X, y) + transformer_id, transformer ) def set_voter( @@ -308,7 +366,7 @@ def set_voter( voter_kwargs=None, bag_id=None, ): - + # TODO parallelize, at least for trees # Type check X # Type check y @@ -351,8 +409,11 @@ if bag_id is None: transformers = self.transformer_id_to_transformers[transformer_id] else: - transformers = [self.transformer_id_to_transformers[transformer_id][bag_id]] - for transformer_num, transformer in enumerate(transformers): + transformers = [ + self.transformer_id_to_transformers[transformer_id][bag_id]] + + def _parallel_helper(transformer_num, transformer, parallel=True): if transformer_id == task_id: voter_data_idx = self.task_id_to_bag_id_to_voter_data_idx[task_id][ transformer_num @@ -361,13 +422,29 @@ ] else: voter_data_idx = np.delete( range(len(X)), self.task_id_to_decider_idx[task_id] ) - self._append_voter( - transformer_id, - task_id, - voter_class(**voter_kwargs).fit( - transformer.transform(X[voter_data_idx]), y[voter_data_idx] - ), - ) + voter = voter_class(**voter_kwargs).fit( + transformer.transform(X[voter_data_idx]), y[voter_data_idx]) + + if parallel: + return transformer_id, task_id, voter + else: + self._append_voter( + transformer_id, + task_id, + voter, + ) + + # Parallel loop over voter training + # TODO Remove or fix.
Tests show this causes drastic slowdown when in parallel + voter_info = Parallel(n_jobs=1)( + delayed(_parallel_helper)(transformer_num, transformer) + for transformer_num, transformer in enumerate(transformers) + ) + for transformer_id, task_id, voter in voter_info: + self._append_voter(transformer_id, task_id, voter) self.task_id_to_voter_class[task_id] = voter_class self.task_id_to_voter_kwargs[task_id] = voter_kwargs @@ -498,29 +575,46 @@ def add_transformer( if transformer_id not in list(self.task_id_to_y.keys()): self.transformer_id_to_y[transformer_id] = y - # train new transformers - for transformer_num in range(num_transformers): - if X is not None: - n = len(X) - elif y is not None: - n = len(y) - else: - n = None + if X is not None: + n = len(X) + elif y is not None: + n = len(y) + else: + n = None + + # transformer helper function + def _train_new_transformer(transformer_num): + # train new transformers if n is not None: transformer_data_idx = np.random.choice( transformer_voter_data_idx, int(transformer_data_proportion * n), - replace=False, + replace=False, # No bootstrapping ) else: transformer_data_idx = None - self.set_transformer( + + transformer = self.set_transformer( transformer_id=transformer_id, transformer_data_idx=transformer_data_idx, transformer_class=transformer_class, transformer_kwargs=transformer_kwargs, + parallel=True, + ) + voter_data_idx = np.setdiff1d( + transformer_voter_data_idx, transformer_data_idx) + + return transformer_num, transformer_id, transformer, voter_data_idx + + # Parallel loop over transformer training + ensemble = Parallel(n_jobs=self.n_jobs)( + delayed(_train_new_transformer)(num) for num in range(num_transformers) + ) + + for transformer_num, transformer_id, transformer, voter_data_idx in ensemble: + self._append_transformer( + transformer_id, transformer ) - voter_data_idx = np.delete(transformer_voter_data_idx, transformer_data_idx) self._append_voter_data_idx( task_id=transformer_id, bag_id=transformer_num, @@ -529,11 +623,13 @@ def add_transformer( # train voters and deciders from new transformer to previous tasks for existing_task_id in np.intersect1d(backward_task_ids, self.get_task_ids()): - self.set_voter(transformer_id=transformer_id, task_id=existing_task_id) + self.set_voter(transformer_id=transformer_id, + task_id=existing_task_id) self.set_decider( task_id=existing_task_id, transformer_ids=list( - self.task_id_to_transformer_id_to_voters[existing_task_id].keys() + self.task_id_to_transformer_id_to_voters[existing_task_id].keys( + ) ), ) @@ -680,7 +776,8 @@ def add_task( if num_transformers == 0: transformer_ids = forward_transformer_ids else: - transformer_ids = np.concatenate([forward_transformer_ids, task_id]) + transformer_ids = np.concatenate( + [forward_transformer_ids, task_id]) else: transformer_ids = self.get_transformer_ids() self.set_decider( diff --git a/proglearn/tests/test_forest.py b/proglearn/tests/test_forest.py index acf7e9976b..7030435e40 100644 --- a/proglearn/tests/test_forest.py +++ b/proglearn/tests/test_forest.py @@ -1,12 +1,15 @@ -import unittest -import pytest -import numpy as np -import random - -from proglearn.forest import LifelongClassificationForest -from proglearn.transformers import TreeClassificationTransformer -from proglearn.voters import TreeClassificationVoter from proglearn.deciders import SimpleArgmaxAverage +from proglearn.voters import TreeClassificationVoter +from proglearn.transformers import TreeClassificationTransformer +from proglearn.forest import 
LifelongClassificationForest, UncertaintyForest +import random +import numpy as np +import time +import pytest +import unittest class TestLifelongClassificationForest: @@ -47,3 +50,98 @@ def test_correct_default_n_estimators(self): def test_correct_true_initilization_finite_sample_correction(self): l2f = LifelongClassificationForest(default_kappa=np.inf) assert l2f.pl_.default_voter_kwargs == {"kappa": np.inf} + + +# Test Uncertainty Forest + + +def test_uf_accuracy(): + uf = UncertaintyForest() + X = np.ones((20, 4)) + X[10:] *= -1 + y = [0] * 10 + [1] * 10 + uf = uf.fit(X, y) + np.testing.assert_array_equal(uf.predict(X), y) + + +@pytest.mark.parametrize("max_depth", [1, None]) +@pytest.mark.parametrize("max_features", [2, 0.5, "auto", "sqrt", "log2"]) +@pytest.mark.parametrize("poisson_sampler", [False, True]) +def test_decision_tree_params(max_depth, max_features, poisson_sampler): + uf = UncertaintyForest( + max_depth=max_depth, max_features=max_features, poisson_sampler=poisson_sampler + ) + X = np.ones((12, 20)) + X[6:] *= -1 + y = [0] * 6 + [1] * 6 + uf = uf.fit(X, y) + + assert uf.n_estimators == len(uf.estimators_) + depths = [est.max_depth for est in uf.estimators_] + assert all(np.asarray(depths) == max_depth) + + features = [est.max_features for est in uf.estimators_] + if poisson_sampler: + assert not all(np.asarray(features) == features[0]) + else: + assert all(np.asarray(features) == max_features) + + +def test_parallel_trees(): + uf = UncertaintyForest(n_estimators=100, n_jobs=1, + max_features=1.0, tree_construction_proportion=0.5) + uf_parallel = UncertaintyForest( + n_estimators=100, n_jobs=10, max_features=1.0, tree_construction_proportion=0.5) + X = np.random.normal(0, 1, (1000, 100)) + y = [0, 1] * (len(X) // 2) + + time_start = time.time() + uf.fit(X, y) + time_diff = time.time() - time_start + + time_start = time.time() + uf_parallel.fit(X, y) + time_parallel_diff = time.time() - time_start + + assert time_parallel_diff / time_diff < 0.9 + + +def test_max_samples(): + max_samples_list = [8, 0.5, None] + depths = [] + X = np.random.normal(0, 1, (100, 2)) + X[:50] *= -1 + y = [0, 1] * 50 + for ms in max_samples_list: + uf = UncertaintyForest(n_estimators=1, max_samples=ms) + uf = uf.fit(X, y) + depths.append(uf.estimators_[0].get_depth()) + + assert all(np.diff(depths) > 0) + +@pytest.mark.parametrize("honest_prior", ["empirical", "uniform", "ignore"]) +def test_honest_prior(honest_prior): + X = np.random.normal(0, 1, (60, 2)) + X[:30] *= -1 + y = [0, 0, 1] * 20 + uf = UncertaintyForest(n_estimators=5, honest_prior=honest_prior) + uf = uf.fit(X, y) + if honest_prior == 'uniform': + assert all([len(set(voter.prior_posterior_)) == 1 for voter in uf.voters_]) + elif honest_prior in ('ignore', 'empirical'): + assert all([np.all(np.diff(voter.prior_posterior_) < 0) for voter in uf.voters_]) + + +@pytest.mark.parametrize("honest_prior", ["empirical", "uniform", "ignore"]) +def test_empty_leaves(honest_prior): + np.random.seed(0) + X = np.random.normal(0, 1, (100, 2)) + y = [0]*75 + [1]*25 + uf = UncertaintyForest(n_estimators=1, honest_prior=honest_prior, tree_construction_proportion=0.96, kappa=np.inf) + uf = uf.fit(X, y) + + y_proba = uf.predict_proba(X) + if honest_prior == 'uniform': + assert len(np.where(y_proba[:, 0] == 0.5)[0]) > 50 + elif honest_prior in ('ignore', 'empirical'): + assert len(np.where(y_proba[:, 0] == 0.75)[0]) > 50 diff --git a/proglearn/transformers.py b/proglearn/transformers.py index
6af4d5fe8b..e9f42f1ead 100644 --- a/proglearn/transformers.py +++ b/proglearn/transformers.py @@ -2,6 +2,7 @@ Main Author: Will LeVine Corresponding Email: levinewill@icloud.com """ +import warnings import keras import numpy as np from sklearn.tree import DecisionTreeClassifier @@ -133,17 +134,53 @@ class TreeClassificationTransformer(BaseTransformer): Parameters ---------- - kwargs : dict, default={} - A dictionary to contain parameters of the tree. + max_features : {"auto", "sqrt", "log2"}, int or float, default=1.0 + The number of features to consider when looking for the best split: + - If int, then consider `max_features` features at each split. + - If float, then `max_features` is a fraction and + `round(max_features * n_features)` features are considered at each + split. + - If "auto", then `max_features=sqrt(n_features)`. + - If "sqrt", then `max_features=sqrt(n_features)`. + - If "log2", then `max_features=log2(n_features)`. + - If None, then `max_features=n_features`. + + Note: the search for a split does not stop until at least one + valid partition of the node samples is found, even if it requires + effectively inspecting more than ``max_features`` features. + + poisson_sampler : boolean, default=False + If True, to match the GRF theory [#1grf]_, the number of features + considered by each tree is drawn from a Poisson distribution with + mean equal to `max_features`. + + sample_weight : array-like of shape (n_samples,), default=None + If None, all samples are weighted equally. Enables efficient + parallelization by giving non-sampled rows of X and y a weight of 0. + + fit_kwargs : dict, default={} + Named arguments passed to the sklearn.tree.DecisionTreeClassifier tree + created during `fit`. Attributes ---------- - transformer : sklearn.tree.DecisionTreeClassifier - an internal sklearn DecisionTreeClassifier + transformer_ : sklearn.tree.DecisionTreeClassifier + an internal sklearn.tree.DecisionTreeClassifier. + + n_features_ : int + The number of features of the data fitted. + + References + ---------- + .. [#1grf] Athey, Susan, Julie Tibshirani and Stefan Wager. + "Generalized Random Forests", Annals of Statistics, 2019. """ - def __init__(self, kwargs={}): - self.kwargs = kwargs + def __init__(self, max_features=1.0, poisson_sampler=False, sample_weight=None, fit_kwargs={}): + self.max_features = max_features + self.poisson_sampler = poisson_sampler + self.sample_weight = sample_weight + self.fit_kwargs = fit_kwargs def fit(self, X, y): """ @@ -162,7 +199,34 @@ def fit(self, X, y): The object itself.
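+
+        Examples
+        --------
+        An illustrative sketch of fitting with Poisson feature sampling
+        (data values are arbitrary)::
+
+            X = np.random.normal(0, 1, (100, 10))
+            y = [0, 1] * 50
+            t = TreeClassificationTransformer(max_features="sqrt",
+                                              poisson_sampler=True)
+            leaves = t.fit(X, y).transform(X)  # one leaf index per sample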
""" X, y = check_X_y(X, y) - self.transformer_ = DecisionTreeClassifier(**self.kwargs).fit(X, y) + self.n_features_ = X.shape[1] + self.prior_posterior_ = np.bincount(y) / len(y) + if self.poisson_sampler: + if self.max_features in ("auto", "sqrt"): + max_features = np.sqrt(self.n_features_) + elif self.max_features == "log2": + max_features = np.log2(self.n_features_) + elif isinstance(self.max_features, float): + assert self.max_features > 0, self.max_features + max_features = self.max_features * self.n_features_ + elif isinstance(self.max_features, int): + assert self.max_features > 0, self.max_features + max_features = self.max_features + else: + raise ValueError(f"max_features value not an accepted value") + if max_features > self.n_features_: + warnings.warn( + "max_features value led to poisson mean " + + "({max_features}) > the number of features" + ) + max_features = int(max_features) + max_features = min(max(np.random.poisson(max_features), 1), self.n_features_) + else: + max_features = self.max_features + + self.transformer_ = DecisionTreeClassifier( + max_features=max_features, **self.fit_kwargs + ).fit(X, y, sample_weight=self.sample_weight) return self def transform(self, X): diff --git a/proglearn/voters.py b/proglearn/voters.py index 838f4bfdf6..6908378640 100755 --- a/proglearn/voters.py +++ b/proglearn/voters.py @@ -27,21 +27,34 @@ class TreeClassificationVoter(BaseClassificationVoter): classes : list, default=[] list of all possible output label values + honest_prior : {"ignore", "uniform", "empirical"}, default="ignore" + Method for dealing with empty leaves during evaluation of a test + sample. If "ignore", trees in which the leaf is empty are not used in + the prediction. If "uniform", the prior tree posterior is 1/(number of + classes). If "empirical", the prior tree posterior is the relative + class frequency in the voting subsample. If all tree leaves are empty, + "ignore" will use the empirical prior and the others will use their + respective priors. + Attributes ---------- missing_label_indices_ : list a (potentially empty) list of label values that exist in the ``classes`` parameter but are missing in the latest ``fit`` function - call + call. - uniform_posterior_ : ndarray of shape (n_classes,) - the uniform posterior associated with the + prior_posterior_ : ndarray of shape (n_classes,) + The prior posterior associated with zero posteriors. + + num_fit_classes_ : int + Number of unique classes in the set of fitted labels. 
""" - def __init__(self, kappa=np.inf, classes=[]): + def __init__(self, kappa=np.inf, classes=[], honest_prior="ignore"): self.kappa = kappa self.classes = np.asarray(classes) + self.honest_prior = honest_prior def fit(self, X, y): """ @@ -49,8 +62,8 @@ def fit(self, X, y): Parameters ---------- - X : array of shape [n_samples, n_features] - the transformed input data + X : array of shape [n_samples,] + Leaf indices each sample falls into y : array of shape [n_samples] the class labels @@ -61,15 +74,22 @@ def fit(self, X, y): """ check_classification_targets(y) - num_fit_classes = len(np.unique(y)) + self.num_fit_classes_ = len(np.unique(y)) self.missing_label_indices_ = [] - if self.classes.size != 0 and num_fit_classes < len(self.classes): + if self.classes.size != 0 and self.num_fit_classes_ < len(self.classes): for idx, label in enumerate(self.classes): if label not in np.unique(y): self.missing_label_indices_.append(idx) - self.uniform_posterior_ = np.ones(num_fit_classes) / num_fit_classes + if self.honest_prior == "uniform": + self.prior_posterior_ = np.ones(self.num_fit_classes_) / self.num_fit_classes_ + elif self.honest_prior in ("empirical", "ignore"): + self.prior_posterior_ = np.bincount( + y, minlength=len(self.classes)) / len(y) + else: + raise ValueError("honest_prior must be in " + + "{'ignore', 'uniform', 'empirical'}") self.leaf_to_posterior_ = {} @@ -92,8 +112,8 @@ def predict_proba(self, X): Parameters ---------- - X : array of shape [n_samples, n_features] - the transformed input data + X : array of shape [n_samples,] + Indices of the leaf each sample falls into Returns ------- @@ -110,8 +130,10 @@ def predict_proba(self, X): for x in X: if x in list(self.leaf_to_posterior_.keys()): votes_per_example.append(self.leaf_to_posterior_[x]) + elif self.honest_prior == "ignore": + votes_per_example.append(np.zeros(self.num_fit_classes_)) else: - votes_per_example.append(self.uniform_posterior_) + votes_per_example.append(self.prior_posterior_) votes_per_example = np.array(votes_per_example) @@ -162,10 +184,12 @@ def _finite_sample_correction(self, posteriors, num_points_in_partition, kappa): y_proba_hat : ndarray of shape [n_samples, n_classes] posteriors per example """ - correction_constant = 1 / (kappa * num_points_in_partition) + # correction_constant = 1 / (kappa * num_points_in_partition) + + # zero_posterior_idxs = np.where(posteriors == 0)[0] + # posteriors[zero_posterior_idxs] = correction_constant - zero_posterior_idxs = np.where(posteriors == 0)[0] - posteriors[zero_posterior_idxs] = correction_constant + posteriors += self.prior_posterior_ / (kappa * num_points_in_partition) posteriors /= sum(posteriors)