From 8c09f7fad193bdb853325ea618b63d2c80b144e0 Mon Sep 17 00:00:00 2001 From: scarliles Date: Fri, 16 Feb 2024 13:36:02 -0500 Subject: [PATCH 01/20] init split condition injection --- sklearn/tree/_splitter.pxd | 5 +++++ sklearn/tree/_splitter.pyx | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index f1434f5d05cc9..3169a9198d3f1 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -19,6 +19,8 @@ from ._utils cimport UINT32_t from ._criterion cimport BaseCriterion, Criterion +ctypedef bint (*SplitCondition)(Splitter*) + cdef struct SplitRecord: # Data to track sample split intp_t feature # Which feature to split on. @@ -112,6 +114,9 @@ cdef class Splitter(BaseSplitter): cdef const cnp.int8_t[:] monotonic_cst cdef bint with_monotonic_cst + cdef SplitCondition[:] pre_split_conditions + cdef SplitCondition[:] post_split_conditions + cdef int init( self, object X, diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 1f781e55350d2..2352862e67f48 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -155,6 +155,8 @@ cdef class Splitter(BaseSplitter): float64_t min_weight_leaf, object random_state, const cnp.int8_t[:] monotonic_cst, + SplitCondition[:] pre_split_conditions=[], + SplitCondition[:] post_split_conditions=[], *argv ): """ @@ -195,6 +197,9 @@ cdef class Splitter(BaseSplitter): self.monotonic_cst = monotonic_cst self.with_monotonic_cst = monotonic_cst is not None + self.pre_split_conditions = pre_split_conditions + self.post_split_conditions = post_split_conditions + def __reduce__(self): return (type(self), (self.criterion, self.max_features, From ecfc9b1d1e6f89c476dc2231d9cda3a484c456e9 Mon Sep 17 00:00:00 2001 From: scarliles Date: Fri, 16 Feb 2024 14:50:27 -0500 Subject: [PATCH 02/20] wip --- sklearn/tree/_splitter.pxd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/_splitter.pxd 
b/sklearn/tree/_splitter.pxd index 3169a9198d3f1..04929e679b024 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -19,7 +19,7 @@ from ._utils cimport UINT32_t from ._criterion cimport BaseCriterion, Criterion -ctypedef bint (*SplitCondition)(Splitter*) +ctypedef bint (*SplitCondition)(Splitter splitter) cdef struct SplitRecord: # Data to track sample split From 0c3d5c0f2a1ac6c8ec8ab9a7fa8bb1af8e721797 Mon Sep 17 00:00:00 2001 From: scarliles Date: Fri, 16 Feb 2024 15:11:51 -0500 Subject: [PATCH 03/20] wip --- sklearn/tree/_splitter.pxd | 4 ++-- sklearn/tree/_splitter.pyx | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 04929e679b024..b8f8d9cfb19f4 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -114,8 +114,8 @@ cdef class Splitter(BaseSplitter): cdef const cnp.int8_t[:] monotonic_cst cdef bint with_monotonic_cst - cdef SplitCondition[:] pre_split_conditions - cdef SplitCondition[:] post_split_conditions + cdef SplitCondition[] pre_split_conditions + cdef SplitCondition[] post_split_conditions cdef int init( self, diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 2352862e67f48..beb0ebae3136d 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -155,8 +155,8 @@ cdef class Splitter(BaseSplitter): float64_t min_weight_leaf, object random_state, const cnp.int8_t[:] monotonic_cst, - SplitCondition[:] pre_split_conditions=[], - SplitCondition[:] post_split_conditions=[], + SplitCondition[] pre_split_conditions=[], + SplitCondition[] post_split_conditions=[], *argv ): """ From 5fd12a2c42db768aaffbd73801fe5e0a2b477089 Mon Sep 17 00:00:00 2001 From: scarliles Date: Tue, 20 Feb 2024 11:52:26 -0500 Subject: [PATCH 04/20] wip --- sklearn/tree/_splitter.pxd | 3 --- sklearn/tree/_splitter.pyx | 5 ----- 2 files changed, 8 deletions(-) diff --git a/sklearn/tree/_splitter.pxd 
b/sklearn/tree/_splitter.pxd index b8f8d9cfb19f4..2e277e0b1d13f 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -114,9 +114,6 @@ cdef class Splitter(BaseSplitter): cdef const cnp.int8_t[:] monotonic_cst cdef bint with_monotonic_cst - cdef SplitCondition[] pre_split_conditions - cdef SplitCondition[] post_split_conditions - cdef int init( self, object X, diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index beb0ebae3136d..1f781e55350d2 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -155,8 +155,6 @@ cdef class Splitter(BaseSplitter): float64_t min_weight_leaf, object random_state, const cnp.int8_t[:] monotonic_cst, - SplitCondition[] pre_split_conditions=[], - SplitCondition[] post_split_conditions=[], *argv ): """ @@ -197,9 +195,6 @@ cdef class Splitter(BaseSplitter): self.monotonic_cst = monotonic_cst self.with_monotonic_cst = monotonic_cst is not None - self.pre_split_conditions = pre_split_conditions - self.post_split_conditions = post_split_conditions - def __reduce__(self): return (type(self), (self.criterion, self.max_features, From b593ee024ad932a93bbc8fb2797a54a981c35604 Mon Sep 17 00:00:00 2001 From: scarliles Date: Mon, 26 Feb 2024 19:09:10 -0500 Subject: [PATCH 05/20] injection progress --- sklearn/tree/_splitter.pxd | 9 ++++++++- sklearn/tree/_splitter.pyx | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 2e277e0b1d13f..3cd2d1dd3898a 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -19,7 +19,11 @@ from ._utils cimport UINT32_t from ._criterion cimport BaseCriterion, Criterion -ctypedef bint (*SplitCondition)(Splitter splitter) +ctypedef bint (*SplitCondition)(Splitter splitter) noexcept nogil + +cdef class SplitConditions: + cdef vector[SplitCondition] value + cdef struct SplitRecord: # Data to track sample split @@ -114,6 +118,9 @@ cdef 
class Splitter(BaseSplitter): cdef const cnp.int8_t[:] monotonic_cst cdef bint with_monotonic_cst + cdef public SplitConditions presplit_conditions + cdef public SplitConditions postsplit_conditions + cdef int init( self, object X, diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 1f781e55350d2..260d571f71392 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -43,6 +43,23 @@ cdef float32_t FEATURE_THRESHOLD = 1e-7 # in SparsePartitioner cdef float32_t EXTRACT_NNZ_SWITCH = 0.1 +cdef bint condition1(Splitter splitter) noexcept nogil: + cdef bint bar = splitter.n_samples > 0 + + return 1 + +cdef class SplitConditions: + def __init__(self, n): + self.value.resize(n) + +def foo(): + presplit_conditions = SplitConditions(2) + presplit_conditions.value[0] = condition1 + presplit_conditions.value[1] = condition1 + + postsplit_conditions = SplitConditions(1) + postsplit_conditions = condition1 + cdef inline void _init_split(SplitRecord* self, intp_t start_pos) noexcept nogil: self.impurity_left = INFINITY self.impurity_right = INFINITY @@ -155,6 +172,8 @@ cdef class Splitter(BaseSplitter): float64_t min_weight_leaf, object random_state, const cnp.int8_t[:] monotonic_cst, + SplitConditions presplit_conditions=None, + SplitConditions postsplit_conditions=None, *argv ): """ @@ -195,6 +214,9 @@ cdef class Splitter(BaseSplitter): self.monotonic_cst = monotonic_cst self.with_monotonic_cst = monotonic_cst is not None + self.presplit_conditions = presplit_conditions + self.postsplit_conditions = postsplit_conditions + def __reduce__(self): return (type(self), (self.criterion, self.max_features, @@ -602,6 +624,11 @@ cdef inline intp_t node_split_best( n_right = end_non_missing - current_split.pos + n_missing if splitter.check_presplit_conditions(&current_split, n_missing, missing_go_to_left) == 1: continue + + if splitter.presplit_conditions is not None: + for condition in splitter.presplit_conditions.value: + if condition(splitter): + 
continue criterion.update(current_split.pos) @@ -620,6 +647,11 @@ cdef inline intp_t node_split_best( # Reject if min_weight_leaf is not satisfied if splitter.check_postsplit_conditions() == 1: continue + + if splitter.postsplit_conditions is not None: + for condition in splitter.postsplit_conditions.value: + if condition(splitter): + continue current_proxy_improvement = criterion.proxy_impurity_improvement() From 180fac32308195301e80d574b9b026fc66fece8b Mon Sep 17 00:00:00 2001 From: scarliles Date: Tue, 27 Feb 2024 13:51:32 -0500 Subject: [PATCH 06/20] injection progress --- sklearn/tree/_splitter.pyx | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 260d571f71392..fd65568963a43 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -44,9 +44,7 @@ cdef float32_t FEATURE_THRESHOLD = 1e-7 cdef float32_t EXTRACT_NNZ_SWITCH = 0.1 cdef bint condition1(Splitter splitter) noexcept nogil: - cdef bint bar = splitter.n_samples > 0 - - return 1 + return splitter.n_samples > 0 cdef class SplitConditions: def __init__(self, n): @@ -58,7 +56,7 @@ def foo(): presplit_conditions.value[1] = condition1 postsplit_conditions = SplitConditions(1) - postsplit_conditions = condition1 + postsplit_conditions.value[0] = condition1 cdef inline void _init_split(SplitRecord* self, intp_t start_pos) noexcept nogil: self.impurity_left = INFINITY From c207c3e220f6bf7bb699660da9a28a96834f01bc Mon Sep 17 00:00:00 2001 From: scarliles Date: Tue, 27 Feb 2024 14:45:32 -0500 Subject: [PATCH 07/20] split injection refactoring --- sklearn/tree/_splitter.pxd | 7 ++----- sklearn/tree/_splitter.pyx | 34 ++++++++++++++-------------------- 2 files changed, 16 insertions(+), 25 deletions(-) diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 3cd2d1dd3898a..37e3554f36dd4 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -21,9 +21,6 @@ from ._criterion 
cimport BaseCriterion, Criterion ctypedef bint (*SplitCondition)(Splitter splitter) noexcept nogil -cdef class SplitConditions: - cdef vector[SplitCondition] value - cdef struct SplitRecord: # Data to track sample split @@ -118,8 +115,8 @@ cdef class Splitter(BaseSplitter): cdef const cnp.int8_t[:] monotonic_cst cdef bint with_monotonic_cst - cdef public SplitConditions presplit_conditions - cdef public SplitConditions postsplit_conditions + cdef vector[SplitCondition] presplit_conditions + cdef vector[SplitCondition] postsplit_conditions cdef int init( self, diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index fd65568963a43..92c7a082283fe 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -46,17 +46,17 @@ cdef float32_t EXTRACT_NNZ_SWITCH = 0.1 cdef bint condition1(Splitter splitter) noexcept nogil: return splitter.n_samples > 0 -cdef class SplitConditions: - def __init__(self, n): - self.value.resize(n) +cdef bint condition2(Splitter splitter) noexcept nogil: + return splitter.n_samples < 10 def foo(): - presplit_conditions = SplitConditions(2) - presplit_conditions.value[0] = condition1 - presplit_conditions.value[1] = condition1 + splitter = Splitter() + + splitter.presplit_conditions.push_back(condition1) + splitter.presplit_conditions.push_back(condition2) + + splitter.postsplit_conditions.push_back(condition1) - postsplit_conditions = SplitConditions(1) - postsplit_conditions.value[0] = condition1 cdef inline void _init_split(SplitRecord* self, intp_t start_pos) noexcept nogil: self.impurity_left = INFINITY @@ -170,8 +170,6 @@ cdef class Splitter(BaseSplitter): float64_t min_weight_leaf, object random_state, const cnp.int8_t[:] monotonic_cst, - SplitConditions presplit_conditions=None, - SplitConditions postsplit_conditions=None, *argv ): """ @@ -212,8 +210,6 @@ cdef class Splitter(BaseSplitter): self.monotonic_cst = monotonic_cst self.with_monotonic_cst = monotonic_cst is not None - self.presplit_conditions = 
presplit_conditions - self.postsplit_conditions = postsplit_conditions def __reduce__(self): return (type(self), (self.criterion, @@ -623,10 +619,9 @@ cdef inline intp_t node_split_best( if splitter.check_presplit_conditions(&current_split, n_missing, missing_go_to_left) == 1: continue - if splitter.presplit_conditions is not None: - for condition in splitter.presplit_conditions.value: - if condition(splitter): - continue + for condition in splitter.presplit_conditions: + if condition(splitter): + continue criterion.update(current_split.pos) @@ -646,10 +641,9 @@ cdef inline intp_t node_split_best( if splitter.check_postsplit_conditions() == 1: continue - if splitter.postsplit_conditions is not None: - for condition in splitter.postsplit_conditions.value: - if condition(splitter): - continue + for condition in splitter.postsplit_conditions: + if condition(splitter): + continue current_proxy_improvement = criterion.proxy_impurity_improvement() From 7cc71c10c49265cf581efb1637b17af142bb7d29 Mon Sep 17 00:00:00 2001 From: scarliles Date: Thu, 29 Feb 2024 11:04:19 -0800 Subject: [PATCH 08/20] added condition parameter passthrough prototype --- sklearn/tree/_splitter.pxd | 25 ++++++++++++++++++++++--- sklearn/tree/_splitter.pyx | 33 ++++++++++++++++++++------------- 2 files changed, 42 insertions(+), 16 deletions(-) diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 37e3554f36dd4..9eec9dd9afad8 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -19,7 +19,26 @@ from ._utils cimport UINT32_t from ._criterion cimport BaseCriterion, Criterion -ctypedef bint (*SplitCondition)(Splitter splitter) noexcept nogil +ctypedef void *SplitConditionParameters +ctypedef bint (*SplitCondition)(Splitter splitter, void* split_condition_parameters) noexcept nogil + +cdef struct SplitConditionTuple: + SplitCondition f + SplitConditionParameters p + +cdef struct DummyParameters: + int dummy + +cdef struct Condition1Parameters: + int some_number + 
+cdef inline bint condition1(Splitter splitter, void* split_condition_parameters) noexcept nogil: + cdef Condition1Parameters* p = split_condition_parameters + + return splitter.n_samples > 0 and p.some_number < 1000 + +cdef inline bint condition2(Splitter splitter, void* split_condition_parameters) noexcept nogil: + return splitter.n_samples < 10 cdef struct SplitRecord: @@ -115,8 +134,8 @@ cdef class Splitter(BaseSplitter): cdef const cnp.int8_t[:] monotonic_cst cdef bint with_monotonic_cst - cdef vector[SplitCondition] presplit_conditions - cdef vector[SplitCondition] postsplit_conditions + cdef vector[SplitConditionTuple] presplit_conditions + cdef vector[SplitConditionTuple] postsplit_conditions cdef int init( self, diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 92c7a082283fe..cc047ac605749 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -19,7 +19,7 @@ from cython cimport final from libc.math cimport isnan -from libc.stdlib cimport qsort +from libc.stdlib cimport qsort, malloc, free from libc.string cimport memcpy cimport numpy as cnp @@ -43,19 +43,26 @@ cdef float32_t FEATURE_THRESHOLD = 1e-7 # in SparsePartitioner cdef float32_t EXTRACT_NNZ_SWITCH = 0.1 -cdef bint condition1(Splitter splitter) noexcept nogil: - return splitter.n_samples > 0 +from ._tree cimport Tree +cdef class FooTree(Tree): + cdef Condition1Parameters* c1p + cdef DummyParameters* dummy_params -cdef bint condition2(Splitter splitter) noexcept nogil: - return splitter.n_samples < 10 + def __init__(self): + splitter = Splitter() + self.c1p = malloc(sizeof(Condition1Parameters)) + self.c1p.some_number = 5 -def foo(): - splitter = Splitter() + self.dummy_params = malloc(sizeof(DummyParameters)) - splitter.presplit_conditions.push_back(condition1) - splitter.presplit_conditions.push_back(condition2) - - splitter.postsplit_conditions.push_back(condition1) + splitter.presplit_conditions.push_back(SplitConditionTuple(condition1, self.c1p)) + 
splitter.presplit_conditions.push_back(SplitConditionTuple(condition2, self.dummy_params)) + + def __dealloc__(self): + if self.c1p is not NULL: + free(self.c1p) + if self.dummy_params is not NULL: + free(self.dummy_params) cdef inline void _init_split(SplitRecord* self, intp_t start_pos) noexcept nogil: @@ -620,7 +627,7 @@ cdef inline intp_t node_split_best( continue for condition in splitter.presplit_conditions: - if condition(splitter): + if not condition.f(splitter, condition.p): continue criterion.update(current_split.pos) @@ -642,7 +649,7 @@ cdef inline intp_t node_split_best( continue for condition in splitter.postsplit_conditions: - if condition(splitter): + if not condition.f(splitter, condition.p): continue current_proxy_improvement = criterion.proxy_impurity_improvement() From 2470d492c6cf52b5cad1bbeec7e272e56c4470cd Mon Sep 17 00:00:00 2001 From: scarliles Date: Thu, 29 Feb 2024 11:32:42 -0800 Subject: [PATCH 09/20] some tidying --- sklearn/tree/_splitter.pxd | 21 ++++++++++++++++++--- sklearn/tree/_splitter.pyx | 15 +++++++-------- 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 9eec9dd9afad8..6b20fec2a56dc 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -13,6 +13,7 @@ cimport numpy as cnp from libcpp.vector cimport vector +from libc.stdlib cimport malloc from ..utils._typedefs cimport float32_t, float64_t, intp_t, int32_t from ._utils cimport UINT32_t @@ -20,7 +21,7 @@ from ._criterion cimport BaseCriterion, Criterion ctypedef void *SplitConditionParameters -ctypedef bint (*SplitCondition)(Splitter splitter, void* split_condition_parameters) noexcept nogil +ctypedef bint (*SplitCondition)(Splitter splitter, SplitConditionParameters split_condition_parameters) noexcept nogil cdef struct SplitConditionTuple: SplitCondition f @@ -29,15 +30,29 @@ cdef struct SplitConditionTuple: cdef struct DummyParameters: int dummy +cdef inline DummyParameters* 
create_dummy_parameters(int dummy): + cdef DummyParameters* result = malloc(sizeof(DummyParameters)) + if result == NULL: + return NULL + result.dummy = dummy + return result + cdef struct Condition1Parameters: int some_number -cdef inline bint condition1(Splitter splitter, void* split_condition_parameters) noexcept nogil: +cdef inline Condition1Parameters* create_condition1_parameters(int some_number): + cdef Condition1Parameters* result = malloc(sizeof(Condition1Parameters)) + if result == NULL: + return NULL + result.some_number = some_number + return result + +cdef inline bint condition1(Splitter splitter, SplitConditionParameters split_condition_parameters) noexcept nogil: cdef Condition1Parameters* p = split_condition_parameters return splitter.n_samples > 0 and p.some_number < 1000 -cdef inline bint condition2(Splitter splitter, void* split_condition_parameters) noexcept nogil: +cdef inline bint condition2(Splitter splitter, SplitConditionParameters split_condition_parameters) noexcept nogil: return splitter.n_samples < 10 diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index cc047ac605749..d6d191462bff3 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -19,7 +19,7 @@ from cython cimport final from libc.math cimport isnan -from libc.stdlib cimport qsort, malloc, free +from libc.stdlib cimport qsort, free from libc.string cimport memcpy cimport numpy as cnp @@ -45,18 +45,17 @@ cdef float32_t EXTRACT_NNZ_SWITCH = 0.1 from ._tree cimport Tree cdef class FooTree(Tree): + cdef Splitter splitter cdef Condition1Parameters* c1p cdef DummyParameters* dummy_params def __init__(self): - splitter = Splitter() - self.c1p = malloc(sizeof(Condition1Parameters)) - self.c1p.some_number = 5 + self.c1p = create_condition1_parameters(5) + self.dummy_params = create_dummy_parameters(0) - self.dummy_params = malloc(sizeof(DummyParameters)) - - splitter.presplit_conditions.push_back(SplitConditionTuple(condition1, self.c1p)) - 
splitter.presplit_conditions.push_back(SplitConditionTuple(condition2, self.dummy_params)) + self.splitter = Splitter() + self.splitter.presplit_conditions.push_back(SplitConditionTuple(condition1, self.c1p)) + self.splitter.presplit_conditions.push_back(SplitConditionTuple(condition2, self.dummy_params)) def __dealloc__(self): if self.c1p is not NULL: From ee3399faf3e2d01f0ccf05e3b7083fe7cbd287c6 Mon Sep 17 00:00:00 2001 From: scarliles Date: Thu, 29 Feb 2024 12:45:48 -0800 Subject: [PATCH 10/20] more tidying --- sklearn/tree/_splitter.pxd | 30 ++++++++++-------------------- sklearn/tree/_splitter.pyx | 16 ++++++---------- 2 files changed, 16 insertions(+), 30 deletions(-) diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 6b20fec2a56dc..1620d744d75c0 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -27,33 +27,23 @@ cdef struct SplitConditionTuple: SplitCondition f SplitConditionParameters p -cdef struct DummyParameters: - int dummy - -cdef inline DummyParameters* create_dummy_parameters(int dummy): - cdef DummyParameters* result = malloc(sizeof(DummyParameters)) - if result == NULL: - return NULL - result.dummy = dummy - return result +cdef inline bint has_data_condition(Splitter splitter, SplitConditionParameters split_condition_parameters) noexcept nogil: + return splitter.n_samples < 10 -cdef struct Condition1Parameters: - int some_number +cdef struct AlphaRegularityParameters: + float64_t alpha -cdef inline Condition1Parameters* create_condition1_parameters(int some_number): - cdef Condition1Parameters* result = malloc(sizeof(Condition1Parameters)) +cdef inline AlphaRegularityParameters* create_alpha_regularity_parameters(float64_t alpha): + cdef AlphaRegularityParameters* result = malloc(sizeof(AlphaRegularityParameters)) if result == NULL: return NULL - result.some_number = some_number + result.alpha = alpha return result -cdef inline bint condition1(Splitter splitter, SplitConditionParameters 
split_condition_parameters) noexcept nogil: - cdef Condition1Parameters* p = split_condition_parameters - - return splitter.n_samples > 0 and p.some_number < 1000 +cdef inline bint alpha_regularity_condition(Splitter splitter, SplitConditionParameters split_condition_parameters) noexcept nogil: + cdef AlphaRegularityParameters* p = split_condition_parameters -cdef inline bint condition2(Splitter splitter, SplitConditionParameters split_condition_parameters) noexcept nogil: - return splitter.n_samples < 10 + return 1 cdef struct SplitRecord: diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index d6d191462bff3..40c20dad96042 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -46,22 +46,18 @@ cdef float32_t EXTRACT_NNZ_SWITCH = 0.1 from ._tree cimport Tree cdef class FooTree(Tree): cdef Splitter splitter - cdef Condition1Parameters* c1p - cdef DummyParameters* dummy_params + cdef AlphaRegularityParameters* p_alpha def __init__(self): - self.c1p = create_condition1_parameters(5) - self.dummy_params = create_dummy_parameters(0) + self.p_alpha = create_alpha_regularity_parameters(0.2) self.splitter = Splitter() - self.splitter.presplit_conditions.push_back(SplitConditionTuple(condition1, self.c1p)) - self.splitter.presplit_conditions.push_back(SplitConditionTuple(condition2, self.dummy_params)) + self.splitter.presplit_conditions.push_back(SplitConditionTuple(alpha_regularity_condition, self.p_alpha)) + self.splitter.presplit_conditions.push_back(SplitConditionTuple(has_data_condition, NULL)) def __dealloc__(self): - if self.c1p is not NULL: - free(self.c1p) - if self.dummy_params is not NULL: - free(self.dummy_params) + if self.p_alpha is not NULL: + free(self.p_alpha) cdef inline void _init_split(SplitRecord* self, intp_t start_pos) noexcept nogil: From a079e4fdac4f24367686bb1398dcfa6bc2d7d115 Mon Sep 17 00:00:00 2001 From: scarliles Date: Sat, 9 Mar 2024 22:12:39 -0500 Subject: [PATCH 11/20] splitter injection refactoring 
--- sklearn/tree/_splitter.pxd | 25 +++--------- sklearn/tree/_splitter.pyx | 80 ++++++++++++++++++++++++++++++-------- 2 files changed, 68 insertions(+), 37 deletions(-) diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 1620d744d75c0..f552101ae40b2 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -20,30 +20,15 @@ from ._utils cimport UINT32_t from ._criterion cimport BaseCriterion, Criterion -ctypedef void *SplitConditionParameters -ctypedef bint (*SplitCondition)(Splitter splitter, SplitConditionParameters split_condition_parameters) noexcept nogil +ctypedef void* SplitConditionParameters +ctypedef bint (*SplitConditionFunction)(Splitter splitter, SplitConditionParameters split_condition_parameters) noexcept nogil cdef struct SplitConditionTuple: - SplitCondition f + SplitConditionFunction f SplitConditionParameters p -cdef inline bint has_data_condition(Splitter splitter, SplitConditionParameters split_condition_parameters) noexcept nogil: - return splitter.n_samples < 10 - -cdef struct AlphaRegularityParameters: - float64_t alpha - -cdef inline AlphaRegularityParameters* create_alpha_regularity_parameters(float64_t alpha): - cdef AlphaRegularityParameters* result = malloc(sizeof(AlphaRegularityParameters)) - if result == NULL: - return NULL - result.alpha = alpha - return result - -cdef inline bint alpha_regularity_condition(Splitter splitter, SplitConditionParameters split_condition_parameters) noexcept nogil: - cdef AlphaRegularityParameters* p = split_condition_parameters - - return 1 +cdef class SplitCondition: + cdef SplitConditionTuple t cdef struct SplitRecord: diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 40c20dad96042..22dbb995dd3f6 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -43,21 +43,56 @@ cdef float32_t FEATURE_THRESHOLD = 1e-7 # in SparsePartitioner cdef float32_t EXTRACT_NNZ_SWITCH = 0.1 + +cdef struct HasDataParameters: + int 
min_samples + +cdef bint has_data_condition(Splitter splitter, SplitConditionParameters split_condition_parameters) noexcept nogil: + cdef HasDataParameters* p = split_condition_parameters + return splitter.n_samples >= p.min_samples + +cdef class HasDataCondition(SplitCondition): + def __cinit__(self, int min_samples): + self.t.f = has_data_condition + self.t.p = malloc(sizeof(HasDataParameters)) + (self.t.p).min_samples = min_samples + + def __dealloc__(self): + if self.t.p is not NULL: + free(self.t.p) + + super.__dealloc__(self) + +cdef struct AlphaRegularityParameters: + float64_t alpha + +cdef bint alpha_regularity_condition(Splitter splitter, SplitConditionParameters split_condition_parameters) noexcept nogil: + cdef AlphaRegularityParameters* p = split_condition_parameters + + return 1 + +cdef class AlphaRegularityCondition(SplitCondition): + def __cinit__(self, float64_t alpha): + self.t.f = alpha_regularity_condition + self.t.p = malloc(sizeof(AlphaRegularityParameters)) + (self.t.p).alpha = alpha + + def __dealloc__(self): + if self.t.p is not NULL: + free(self.t.p) + + super.__dealloc__(self) + + from ._tree cimport Tree cdef class FooTree(Tree): cdef Splitter splitter - cdef AlphaRegularityParameters* p_alpha def __init__(self): - self.p_alpha = create_alpha_regularity_parameters(0.2) - - self.splitter = Splitter() - self.splitter.presplit_conditions.push_back(SplitConditionTuple(alpha_regularity_condition, self.p_alpha)) - self.splitter.presplit_conditions.push_back(SplitConditionTuple(has_data_condition, NULL)) - - def __dealloc__(self): - if self.p_alpha is not NULL: - free(self.p_alpha) + self.splitter = Splitter( + presplit_conditions = [HasDataCondition(10)], + postsplit_conditions = [AlphaRegularityCondition(0.1)], + ) cdef inline void _init_split(SplitRecord* self, intp_t start_pos) noexcept nogil: @@ -172,6 +207,8 @@ cdef class Splitter(BaseSplitter): float64_t min_weight_leaf, object random_state, const cnp.int8_t[:] monotonic_cst, + 
SplitCondition[:] presplit_conditions, + SplitCondition[:] postsplit_conditions, *argv ): """ @@ -212,6 +249,14 @@ cdef class Splitter(BaseSplitter): self.monotonic_cst = monotonic_cst self.with_monotonic_cst = monotonic_cst is not None + if presplit_conditions is not None: + for condition in presplit_conditions: + self.presplit_conditions.push_back((condition).t) + + if postsplit_conditions is not None: + for condition in postsplit_conditions: + self.postsplit_conditions.push_back((condition).t) + def __reduce__(self): return (type(self), (self.criterion, @@ -618,13 +663,14 @@ cdef inline intp_t node_split_best( else: n_left = current_split.pos - splitter.start n_right = end_non_missing - current_split.pos + n_missing - if splitter.check_presplit_conditions(&current_split, n_missing, missing_go_to_left) == 1: - continue - + for condition in splitter.presplit_conditions: if not condition.f(splitter, condition.p): continue + if splitter.check_presplit_conditions(&current_split, n_missing, missing_go_to_left) == 1: + continue + criterion.update(current_split.pos) # Reject if monotonicity constraints are not satisfied @@ -639,14 +685,14 @@ cdef inline intp_t node_split_best( ): continue - # Reject if min_weight_leaf is not satisfied - if splitter.check_postsplit_conditions() == 1: - continue - for condition in splitter.postsplit_conditions: if not condition.f(splitter, condition.p): continue + # Reject if min_weight_leaf is not satisfied + if splitter.check_postsplit_conditions() == 1: + continue + current_proxy_improvement = criterion.proxy_impurity_improvement() if current_proxy_improvement > best_proxy_improvement: From 5397b666fe21025c113d30e8eb39c50556b0fca7 Mon Sep 17 00:00:00 2001 From: scarliles Date: Fri, 15 Mar 2024 17:46:16 -0400 Subject: [PATCH 12/20] cython injection due diligence, converted min_sample and monotonic_cst to injections --- sklearn/tree/_splitter.pxd | 22 ++++- sklearn/tree/_splitter.pyx | 191 +++++++++++++++++++++++++++++-------- 2 files changed, 173 
insertions(+), 40 deletions(-) diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index f552101ae40b2..9a400f3954b13 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -6,6 +6,7 @@ # Jacob Schreiber # Adam Li # Jong Shin +# Samuel Carliles # # License: BSD 3 clause @@ -20,8 +21,27 @@ from ._utils cimport UINT32_t from ._criterion cimport BaseCriterion, Criterion +# NICE IDEAS THAT DON'T APPEAR POSSIBLE +# - accessing elements of a memory view of cython extension types in a nogil block/function +# - storing cython extension types in cpp vectors +# +# despite the fact that we can access scalar extension type properties in such a context, +# as for instance node_split_best does with Criterion and Partition, +# and we can access the elements of a memory view of primitive types in such a context +# +# SO WHERE DOES THAT LEAVE US +# - we can transform these into cpp vectors of structs +# and with some minor casting irritations everything else works ok ctypedef void* SplitConditionParameters -ctypedef bint (*SplitConditionFunction)(Splitter splitter, SplitConditionParameters split_condition_parameters) noexcept nogil +ctypedef bint (*SplitConditionFunction)( + Splitter splitter, + SplitRecord* current_split, + intp_t n_missing, + bint missing_go_to_left, + float64_t lower_bound, + float64_t upper_bound, + SplitConditionParameters split_condition_parameters +) noexcept nogil cdef struct SplitConditionTuple: SplitConditionFunction f diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 22dbb995dd3f6..bb21548ef4b31 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -44,10 +44,99 @@ cdef float32_t FEATURE_THRESHOLD = 1e-7 cdef float32_t EXTRACT_NNZ_SWITCH = 0.1 +cdef bint min_sample_leaf_condition( + Splitter splitter, + SplitRecord* current_split, + intp_t n_missing, + bint missing_go_to_left, + float64_t lower_bound, + float64_t upper_bound, + SplitConditionParameters 
split_condition_parameters +) noexcept nogil: + cdef intp_t min_samples_leaf = splitter.min_samples_leaf + cdef intp_t end_non_missing = splitter.end - n_missing + cdef intp_t n_left, n_right + + if missing_go_to_left: + n_left = current_split.pos - splitter.start + n_missing + n_right = end_non_missing - current_split.pos + else: + n_left = current_split.pos - splitter.start + n_right = end_non_missing - current_split.pos + n_missing + + # Reject if min_samples_leaf is not guaranteed + if n_left < min_samples_leaf or n_right < min_samples_leaf: + return 0 + + return 1 + +cdef class MinSamplesLeafCondition(SplitCondition): + def __cinit__(self): + self.t.f = min_sample_leaf_condition + self.t.p = NULL # min_samples is stored in splitter, which is already passed to f + +cdef bint min_weight_leaf_condition( + Splitter splitter, + SplitRecord* current_split, + intp_t n_missing, + bint missing_go_to_left, + float64_t lower_bound, + float64_t upper_bound, + SplitConditionParameters split_condition_parameters +) noexcept nogil: + cdef float64_t min_weight_leaf = splitter.min_weight_leaf + + # Reject if min_weight_leaf is not satisfied + if ((splitter.criterion.weighted_n_left < min_weight_leaf) or + (splitter.criterion.weighted_n_right < min_weight_leaf)): + return 0 + + return 1 + +cdef class MinWeightLeafCondition(SplitCondition): + def __cinit__(self): + self.t.f = min_weight_leaf_condition + self.t.p = NULL # min_weight_leaf is stored in splitter, which is already passed to f + +cdef bint monotonic_constraint_condition( + Splitter splitter, + SplitRecord* current_split, + intp_t n_missing, + bint missing_go_to_left, + float64_t lower_bound, + float64_t upper_bound, + SplitConditionParameters split_condition_parameters +) noexcept nogil: + if ( + splitter.with_monotonic_cst and + splitter.monotonic_cst[current_split.feature] != 0 and + not splitter.criterion.check_monotonicity( + splitter.monotonic_cst[current_split.feature], + lower_bound, + upper_bound, + ) + ): + 
return 0 + + return 1 + +cdef class MonotonicConstraintCondition(SplitCondition): + def __cinit__(self): + self.t.f = monotonic_constraint_condition + self.t.p = NULL + cdef struct HasDataParameters: int min_samples -cdef bint has_data_condition(Splitter splitter, SplitConditionParameters split_condition_parameters) noexcept nogil: +cdef bint has_data_condition( + Splitter splitter, + SplitRecord* current_split, + intp_t n_missing, + bint missing_go_to_left, + float64_t lower_bound, + float64_t upper_bound, + SplitConditionParameters split_condition_parameters +) noexcept nogil: cdef HasDataParameters* p = split_condition_parameters return splitter.n_samples >= p.min_samples @@ -66,7 +155,15 @@ cdef class HasDataCondition(SplitCondition): cdef struct AlphaRegularityParameters: float64_t alpha -cdef bint alpha_regularity_condition(Splitter splitter, SplitConditionParameters split_condition_parameters) noexcept nogil: +cdef bint alpha_regularity_condition( + Splitter splitter, + SplitRecord* current_split, + intp_t n_missing, + bint missing_go_to_left, + float64_t lower_bound, + float64_t upper_bound, + SplitConditionParameters split_condition_parameters +) noexcept nogil: cdef AlphaRegularityParameters* p = split_condition_parameters return 1 @@ -249,14 +346,24 @@ cdef class Splitter(BaseSplitter): self.monotonic_cst = monotonic_cst self.with_monotonic_cst = monotonic_cst is not None + self.min_samples_leaf_condition = MinSamplesLeafCondition() + self.min_weight_leaf_condition = MinWeightLeafCondition() + + self.presplit_conditions.push_back((self.min_samples_leaf_condition).t) if presplit_conditions is not None: for condition in presplit_conditions: self.presplit_conditions.push_back((condition).t) + self.postsplit_conditions.push_back((self.min_weight_leaf_condition).t) if postsplit_conditions is not None: for condition in postsplit_conditions: self.postsplit_conditions.push_back((condition).t) + if(self.with_monotonic_cst): + self.monotonic_constraint_condition = 
MonotonicConstraintCondition() + self.presplit_conditions.push_back((self.monotonic_constraint_condition).t) + self.postsplit_conditions.push_back((self.monotonic_constraint_condition).t) + def __reduce__(self): return (type(self), (self.criterion, @@ -644,54 +751,60 @@ cdef inline intp_t node_split_best( current_split.pos = p - # Reject if monotonicity constraints are not satisfied - if ( - with_monotonic_cst and - monotonic_cst[current_split.feature] != 0 and - not criterion.check_monotonicity( - monotonic_cst[current_split.feature], - lower_bound, - upper_bound, - ) - ): - continue - - # Reject if min_samples_leaf is not guaranteed - if missing_go_to_left: - n_left = current_split.pos - splitter.start + n_missing - n_right = end_non_missing - current_split.pos - else: - n_left = current_split.pos - splitter.start - n_right = end_non_missing - current_split.pos + n_missing + # # Reject if monotonicity constraints are not satisfied + # if ( + # with_monotonic_cst and + # monotonic_cst[current_split.feature] != 0 and + # not criterion.check_monotonicity( + # monotonic_cst[current_split.feature], + # lower_bound, + # upper_bound, + # ) + # ): + # continue + + # # Reject if min_samples_leaf is not guaranteed + # if missing_go_to_left: + # n_left = current_split.pos - splitter.start + n_missing + # n_right = end_non_missing - current_split.pos + # else: + # n_left = current_split.pos - splitter.start + # n_right = end_non_missing - current_split.pos + n_missing for condition in splitter.presplit_conditions: - if not condition.f(splitter, condition.p): + if not condition.f( + splitter, ¤t_split, n_missing, missing_go_to_left, + lower_bound, upper_bound, condition.p + ): continue - if splitter.check_presplit_conditions(¤t_split, n_missing, missing_go_to_left) == 1: - continue + # if splitter.check_presplit_conditions(¤t_split, n_missing, missing_go_to_left) == 1: + # continue criterion.update(current_split.pos) - # Reject if monotonicity constraints are not satisfied - 
if ( - with_monotonic_cst and - monotonic_cst[current_split.feature] != 0 and - not criterion.check_monotonicity( - monotonic_cst[current_split.feature], - lower_bound, - upper_bound, - ) - ): - continue + # # Reject if monotonicity constraints are not satisfied + # if ( + # with_monotonic_cst and + # monotonic_cst[current_split.feature] != 0 and + # not criterion.check_monotonicity( + # monotonic_cst[current_split.feature], + # lower_bound, + # upper_bound, + # ) + # ): + # continue for condition in splitter.postsplit_conditions: - if not condition.f(splitter, condition.p): + if not condition.f( + splitter, ¤t_split, n_missing, missing_go_to_left, + lower_bound, upper_bound, condition.p + ): continue - # Reject if min_weight_leaf is not satisfied - if splitter.check_postsplit_conditions() == 1: - continue + # # Reject if min_weight_leaf is not satisfied + # if splitter.check_postsplit_conditions() == 1: + # continue current_proxy_improvement = criterion.proxy_impurity_improvement() From 44f1d570fd0ba0503737c3f705e83f2ec7b8836a Mon Sep 17 00:00:00 2001 From: scarliles Date: Mon, 18 Mar 2024 14:53:58 -0400 Subject: [PATCH 13/20] tree tests pass huzzah! 
--- sklearn/tree/_splitter.pxd | 4 ++++ sklearn/tree/_splitter.pyx | 36 ++++++++++++++++++++++++------------ 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 9a400f3954b13..0edd4eb40231c 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -144,6 +144,10 @@ cdef class Splitter(BaseSplitter): cdef const cnp.int8_t[:] monotonic_cst cdef bint with_monotonic_cst + cdef SplitCondition min_samples_leaf_condition + cdef SplitCondition min_weight_leaf_condition + cdef SplitCondition monotonic_constraint_condition + cdef vector[SplitConditionTuple] presplit_conditions cdef vector[SplitConditionTuple] postsplit_conditions diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index bb21548ef4b31..983a6f89b4a43 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -66,9 +66,9 @@ cdef bint min_sample_leaf_condition( # Reject if min_samples_leaf is not guaranteed if n_left < min_samples_leaf or n_right < min_samples_leaf: - return 0 + return False - return 1 + return True cdef class MinSamplesLeafCondition(SplitCondition): def __cinit__(self): @@ -89,9 +89,9 @@ cdef bint min_weight_leaf_condition( # Reject if min_weight_leaf is not satisfied if ((splitter.criterion.weighted_n_left < min_weight_leaf) or (splitter.criterion.weighted_n_right < min_weight_leaf)): - return 0 + return False - return 1 + return True cdef class MinWeightLeafCondition(SplitCondition): def __cinit__(self): @@ -116,9 +116,9 @@ cdef bint monotonic_constraint_condition( upper_bound, ) ): - return 0 + return False - return 1 + return True cdef class MonotonicConstraintCondition(SplitCondition): def __cinit__(self): @@ -166,7 +166,7 @@ cdef bint alpha_regularity_condition( ) noexcept nogil: cdef AlphaRegularityParameters* p = split_condition_parameters - return 1 + return True cdef class AlphaRegularityCondition(SplitCondition): def __cinit__(self, float64_t alpha): @@ -304,8 
+304,8 @@ cdef class Splitter(BaseSplitter): float64_t min_weight_leaf, object random_state, const cnp.int8_t[:] monotonic_cst, - SplitCondition[:] presplit_conditions, - SplitCondition[:] postsplit_conditions, + SplitCondition[:] presplit_conditions = None, + SplitCondition[:] postsplit_conditions = None, *argv ): """ @@ -657,6 +657,8 @@ cdef inline intp_t node_split_best( # n_total_constants = n_known_constants + n_found_constants cdef intp_t n_total_constants = n_known_constants + cdef bint conditions_hold = True + _init_split(&best_split, end) partitioner.init_node_split(start, end) @@ -771,12 +773,17 @@ cdef inline intp_t node_split_best( # n_left = current_split.pos - splitter.start # n_right = end_non_missing - current_split.pos + n_missing + conditions_hold = True for condition in splitter.presplit_conditions: if not condition.f( splitter, ¤t_split, n_missing, missing_go_to_left, lower_bound, upper_bound, condition.p ): - continue + conditions_hold = False + break + + if not conditions_hold: + continue # if splitter.check_presplit_conditions(¤t_split, n_missing, missing_go_to_left) == 1: # continue @@ -795,13 +802,18 @@ cdef inline intp_t node_split_best( # ): # continue + conditions_hold = True for condition in splitter.postsplit_conditions: if not condition.f( splitter, ¤t_split, n_missing, missing_go_to_left, lower_bound, upper_bound, condition.p ): - continue - + conditions_hold = False + break + + if not conditions_hold: + continue + # # Reject if min_weight_leaf is not satisfied # if splitter.check_postsplit_conditions() == 1: # continue From 4f19d53c1a57fd2e37739d5028f550eb5ba88ba4 Mon Sep 17 00:00:00 2001 From: scarliles Date: Mon, 18 Mar 2024 16:19:33 -0400 Subject: [PATCH 14/20] added some splitconditions to header --- sklearn/tree/_splitter.pxd | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 0edd4eb40231c..6c9d0d676142a 100644 --- a/sklearn/tree/_splitter.pxd +++ 
b/sklearn/tree/_splitter.pxd @@ -50,6 +50,15 @@ cdef struct SplitConditionTuple: cdef class SplitCondition: cdef SplitConditionTuple t +cdef class MinSamplesLeafCondition(SplitCondition): + pass + +cdef class MinWeightLeafCondition(SplitCondition): + pass + +cdef class MonotonicConstraintCondition(SplitCondition): + pass + cdef struct SplitRecord: # Data to track sample split From cb71be0cdb8be46b19bbdd91d6c5da4897359ff3 Mon Sep 17 00:00:00 2001 From: scarliles Date: Thu, 21 Mar 2024 10:33:33 -0400 Subject: [PATCH 15/20] commented out some sample code that was substantially increasing peak memory utilization in asv --- sklearn/tree/_splitter.pyx | 116 ++++++++++++++++++------------------- 1 file changed, 58 insertions(+), 58 deletions(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 983a6f89b4a43..6b0a6950b7739 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -125,71 +125,71 @@ cdef class MonotonicConstraintCondition(SplitCondition): self.t.f = monotonic_constraint_condition self.t.p = NULL -cdef struct HasDataParameters: - int min_samples - -cdef bint has_data_condition( - Splitter splitter, - SplitRecord* current_split, - intp_t n_missing, - bint missing_go_to_left, - float64_t lower_bound, - float64_t upper_bound, - SplitConditionParameters split_condition_parameters -) noexcept nogil: - cdef HasDataParameters* p = split_condition_parameters - return splitter.n_samples >= p.min_samples - -cdef class HasDataCondition(SplitCondition): - def __cinit__(self, int min_samples): - self.t.f = has_data_condition - self.t.p = malloc(sizeof(HasDataParameters)) - (self.t.p).min_samples = min_samples +# cdef struct HasDataParameters: +# int min_samples + +# cdef bint has_data_condition( +# Splitter splitter, +# SplitRecord* current_split, +# intp_t n_missing, +# bint missing_go_to_left, +# float64_t lower_bound, +# float64_t upper_bound, +# SplitConditionParameters split_condition_parameters +# ) noexcept nogil: +# cdef 
HasDataParameters* p = split_condition_parameters +# return splitter.n_samples >= p.min_samples + +# cdef class HasDataCondition(SplitCondition): +# def __cinit__(self, int min_samples): +# self.t.f = has_data_condition +# self.t.p = malloc(sizeof(HasDataParameters)) +# (self.t.p).min_samples = min_samples - def __dealloc__(self): - if self.t.p is not NULL: - free(self.t.p) +# def __dealloc__(self): +# if self.t.p is not NULL: +# free(self.t.p) - super.__dealloc__(self) - -cdef struct AlphaRegularityParameters: - float64_t alpha - -cdef bint alpha_regularity_condition( - Splitter splitter, - SplitRecord* current_split, - intp_t n_missing, - bint missing_go_to_left, - float64_t lower_bound, - float64_t upper_bound, - SplitConditionParameters split_condition_parameters -) noexcept nogil: - cdef AlphaRegularityParameters* p = split_condition_parameters - - return True - -cdef class AlphaRegularityCondition(SplitCondition): - def __cinit__(self, float64_t alpha): - self.t.f = alpha_regularity_condition - self.t.p = malloc(sizeof(AlphaRegularityParameters)) - (self.t.p).alpha = alpha +# super.__dealloc__(self) + +# cdef struct AlphaRegularityParameters: +# float64_t alpha + +# cdef bint alpha_regularity_condition( +# Splitter splitter, +# SplitRecord* current_split, +# intp_t n_missing, +# bint missing_go_to_left, +# float64_t lower_bound, +# float64_t upper_bound, +# SplitConditionParameters split_condition_parameters +# ) noexcept nogil: +# cdef AlphaRegularityParameters* p = split_condition_parameters + +# return True + +# cdef class AlphaRegularityCondition(SplitCondition): +# def __cinit__(self, float64_t alpha): +# self.t.f = alpha_regularity_condition +# self.t.p = malloc(sizeof(AlphaRegularityParameters)) +# (self.t.p).alpha = alpha - def __dealloc__(self): - if self.t.p is not NULL: - free(self.t.p) +# def __dealloc__(self): +# if self.t.p is not NULL: +# free(self.t.p) - super.__dealloc__(self) +# super.__dealloc__(self) -from ._tree cimport Tree -cdef class 
FooTree(Tree): - cdef Splitter splitter +# from ._tree cimport Tree +# cdef class FooTree(Tree): +# cdef Splitter splitter - def __init__(self): - self.splitter = Splitter( - presplit_conditions = [HasDataCondition(10)], - postsplit_conditions = [AlphaRegularityCondition(0.1)], - ) +# def __init__(self): +# self.splitter = Splitter( +# presplit_conditions = [HasDataCondition(10)], +# postsplit_conditions = [AlphaRegularityCondition(0.1)], +# ) cdef inline void _init_split(SplitRecord* self, intp_t start_pos) noexcept nogil: From 75146197bbab8d1fc7001a658c08f71a3103c788 Mon Sep 17 00:00:00 2001 From: scarliles Date: Tue, 26 Mar 2024 14:48:59 -0400 Subject: [PATCH 16/20] added vector resize to Splitter --- sklearn/tree/_splitter.pyx | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 6b0a6950b7739..9042587d143a8 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -349,11 +349,21 @@ cdef class Splitter(BaseSplitter): self.min_samples_leaf_condition = MinSamplesLeafCondition() self.min_weight_leaf_condition = MinWeightLeafCondition() + self.presplit_conditions.resize( + 1 + + ( 0 if presplit_conditions is None else len(presplit_conditions) ) + + ( 1 if self.with_monotonic_cst else 0 ) + ) self.presplit_conditions.push_back((self.min_samples_leaf_condition).t) if presplit_conditions is not None: for condition in presplit_conditions: self.presplit_conditions.push_back((condition).t) - + + self.postsplit_conditions.resize( + 1 + + ( 0 if postsplit_conditions is None else len(postsplit_conditions) ) + + ( 1 if self.with_monotonic_cst else 0 ) + ) self.postsplit_conditions.push_back((self.min_weight_leaf_condition).t) if postsplit_conditions is not None: for condition in postsplit_conditions: From 24bfd2245007ccc446bb94766bb3573b33bdcf63 Mon Sep 17 00:00:00 2001 From: scarliles Date: Wed, 27 Mar 2024 11:06:47 -0400 Subject: [PATCH 17/20] cython minutiae --- 
sklearn/tree/_splitter.pyx | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 9042587d143a8..a966047a1bbf0 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -349,21 +349,20 @@ cdef class Splitter(BaseSplitter): self.min_samples_leaf_condition = MinSamplesLeafCondition() self.min_weight_leaf_condition = MinWeightLeafCondition() - self.presplit_conditions.resize( - 1 - + ( 0 if presplit_conditions is None else len(presplit_conditions) ) - + ( 1 if self.with_monotonic_cst else 0 ) - ) + cdef intp_t n_conditions = 1 + + ( 0 if presplit_conditions is None else len(presplit_conditions) ) + + ( 1 if self.with_monotonic_cst else 0 ) + + self.presplit_conditions.resize(n_conditions) self.presplit_conditions.push_back((self.min_samples_leaf_condition).t) if presplit_conditions is not None: for condition in presplit_conditions: self.presplit_conditions.push_back((condition).t) - self.postsplit_conditions.resize( - 1 - + ( 0 if postsplit_conditions is None else len(postsplit_conditions) ) - + ( 1 if self.with_monotonic_cst else 0 ) - ) + n_conditions = 1 + + ( 0 if postsplit_conditions is None else len(postsplit_conditions) ) + + ( 1 if self.with_monotonic_cst else 0 ) + self.postsplit_conditions.resize(n_conditions) self.postsplit_conditions.push_back((self.min_weight_leaf_condition).t) if postsplit_conditions is not None: for condition in postsplit_conditions: From 7e52d2a0877edb3ac6e856562ba5fa802a72bc78 Mon Sep 17 00:00:00 2001 From: scarliles Date: Wed, 27 Mar 2024 20:34:29 -0400 Subject: [PATCH 18/20] trying pointer to vector (instead of inline) in extension type --- sklearn/tree/_splitter.pxd | 4 ++-- sklearn/tree/_splitter.pyx | 31 ++++++++++++++++++++----------- 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 6c9d0d676142a..a41c9e0f9c4a4 100644 --- 
a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -157,8 +157,8 @@ cdef class Splitter(BaseSplitter): cdef SplitCondition min_weight_leaf_condition cdef SplitCondition monotonic_constraint_condition - cdef vector[SplitConditionTuple] presplit_conditions - cdef vector[SplitConditionTuple] postsplit_conditions + cdef vector[SplitConditionTuple] *presplit_conditions + cdef vector[SplitConditionTuple] *postsplit_conditions cdef int init( self, diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index a966047a1bbf0..244b080d94f9c 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -353,16 +353,18 @@ cdef class Splitter(BaseSplitter): + ( 0 if presplit_conditions is None else len(presplit_conditions) ) + ( 1 if self.with_monotonic_cst else 0 ) - self.presplit_conditions.resize(n_conditions) + self.presplit_conditions = new vector[SplitConditionTuple](n_conditions) + + n_conditions = 1 + + ( 0 if postsplit_conditions is None else len(postsplit_conditions) ) + + ( 1 if self.with_monotonic_cst else 0 ) + self.postsplit_conditions = new vector[SplitConditionTuple](n_conditions) + self.presplit_conditions.push_back((self.min_samples_leaf_condition).t) if presplit_conditions is not None: for condition in presplit_conditions: self.presplit_conditions.push_back((condition).t) - n_conditions = 1 - + ( 0 if postsplit_conditions is None else len(postsplit_conditions) ) - + ( 1 if self.with_monotonic_cst else 0 ) - self.postsplit_conditions.resize(n_conditions) self.postsplit_conditions.push_back((self.min_weight_leaf_condition).t) if postsplit_conditions is not None: for condition in postsplit_conditions: @@ -374,6 +376,11 @@ cdef class Splitter(BaseSplitter): self.postsplit_conditions.push_back((self.monotonic_constraint_condition).t) + def __dealloc__(self): + del self.presplit_conditions + del self.postsplit_conditions + + def __reduce__(self): return (type(self), (self.criterion, self.max_features, @@ -783,10 +790,11 @@ 
cdef inline intp_t node_split_best( # n_right = end_non_missing - current_split.pos + n_missing conditions_hold = True - for condition in splitter.presplit_conditions: - if not condition.f( + # for condition in splitter.presplit_conditions: + for i in range(splitter.presplit_conditions.size()): + if not splitter.presplit_conditions[0][i].f( splitter, ¤t_split, n_missing, missing_go_to_left, - lower_bound, upper_bound, condition.p + lower_bound, upper_bound, splitter.presplit_conditions[0][i].p ): conditions_hold = False break @@ -812,10 +820,11 @@ cdef inline intp_t node_split_best( # continue conditions_hold = True - for condition in splitter.postsplit_conditions: - if not condition.f( + # for condition in splitter.postsplit_conditions: + for i in range(splitter.postsplit_conditions.size()): + if not splitter.presplit_conditions[0][i].f( splitter, ¤t_split, n_missing, missing_go_to_left, - lower_bound, upper_bound, condition.p + lower_bound, upper_bound, splitter.presplit_conditions[0][i].p ): conditions_hold = False break From 53d7abbb70714032484b1a1c0aed109c5b974fdf Mon Sep 17 00:00:00 2001 From: scarliles Date: Wed, 27 Mar 2024 20:47:05 -0400 Subject: [PATCH 19/20] asv setup_cache failing with pointer to vector, experiment cost exceeding benefit, reverting --- sklearn/tree/_splitter.pxd | 4 ++-- sklearn/tree/_splitter.pyx | 30 ++++++------------------------ 2 files changed, 8 insertions(+), 26 deletions(-) diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index a41c9e0f9c4a4..6c9d0d676142a 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -157,8 +157,8 @@ cdef class Splitter(BaseSplitter): cdef SplitCondition min_weight_leaf_condition cdef SplitCondition monotonic_constraint_condition - cdef vector[SplitConditionTuple] *presplit_conditions - cdef vector[SplitConditionTuple] *postsplit_conditions + cdef vector[SplitConditionTuple] presplit_conditions + cdef vector[SplitConditionTuple] postsplit_conditions cdef int 
init( self, diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 244b080d94f9c..c9d665cc70f56 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -349,17 +349,6 @@ cdef class Splitter(BaseSplitter): self.min_samples_leaf_condition = MinSamplesLeafCondition() self.min_weight_leaf_condition = MinWeightLeafCondition() - cdef intp_t n_conditions = 1 - + ( 0 if presplit_conditions is None else len(presplit_conditions) ) - + ( 1 if self.with_monotonic_cst else 0 ) - - self.presplit_conditions = new vector[SplitConditionTuple](n_conditions) - - n_conditions = 1 - + ( 0 if postsplit_conditions is None else len(postsplit_conditions) ) - + ( 1 if self.with_monotonic_cst else 0 ) - self.postsplit_conditions = new vector[SplitConditionTuple](n_conditions) - self.presplit_conditions.push_back((self.min_samples_leaf_condition).t) if presplit_conditions is not None: for condition in presplit_conditions: @@ -376,11 +365,6 @@ cdef class Splitter(BaseSplitter): self.postsplit_conditions.push_back((self.monotonic_constraint_condition).t) - def __dealloc__(self): - del self.presplit_conditions - del self.postsplit_conditions - - def __reduce__(self): return (type(self), (self.criterion, self.max_features, @@ -790,11 +774,10 @@ cdef inline intp_t node_split_best( # n_right = end_non_missing - current_split.pos + n_missing conditions_hold = True - # for condition in splitter.presplit_conditions: - for i in range(splitter.presplit_conditions.size()): - if not splitter.presplit_conditions[0][i].f( + for condition in splitter.presplit_conditions: + if not condition.f( splitter, ¤t_split, n_missing, missing_go_to_left, - lower_bound, upper_bound, splitter.presplit_conditions[0][i].p + lower_bound, upper_bound, condition.p ): conditions_hold = False break @@ -820,11 +803,10 @@ cdef inline intp_t node_split_best( # continue conditions_hold = True - # for condition in splitter.postsplit_conditions: - for i in 
range(splitter.postsplit_conditions.size()): - if not splitter.presplit_conditions[0][i].f( + for condition in splitter.postsplit_conditions: + if not condition.f( splitter, ¤t_split, n_missing, missing_go_to_left, - lower_bound, upper_bound, splitter.presplit_conditions[0][i].p + lower_bound, upper_bound, condition.p ): conditions_hold = False break From 9d6091a1479da7f70614271767df439742834959 Mon Sep 17 00:00:00 2001 From: scarliles Date: Tue, 2 Apr 2024 18:07:13 -0400 Subject: [PATCH 20/20] pickle and vector resize refactoring --- sklearn/tree/_splitter.pxd | 9 ++-- sklearn/tree/_splitter.pyx | 105 ++++++++++++++++++++++++++++--------- 2 files changed, 84 insertions(+), 30 deletions(-) diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 6c9d0d676142a..b365bc505652c 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -153,12 +153,11 @@ cdef class Splitter(BaseSplitter): cdef const cnp.int8_t[:] monotonic_cst cdef bint with_monotonic_cst - cdef SplitCondition min_samples_leaf_condition - cdef SplitCondition min_weight_leaf_condition - cdef SplitCondition monotonic_constraint_condition + cdef list _presplit_conditions + cdef list _postsplit_conditions - cdef vector[SplitConditionTuple] presplit_conditions - cdef vector[SplitConditionTuple] postsplit_conditions + cdef vector[SplitConditionTuple*] presplit_conditions + cdef vector[SplitConditionTuple*] postsplit_conditions cdef int init( self, diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index c9d665cc70f56..0d9720addfbb7 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -44,6 +44,31 @@ cdef float32_t FEATURE_THRESHOLD = 1e-7 cdef float32_t EXTRACT_NNZ_SWITCH = 0.1 +cdef class SplitCondition: + def __cinit__(self): + self._constitute_tuple(None) + + def __dealloc__(self): + if self.t.p is not NULL: + free(self.t.p) + + def _constitute_tuple(self): + raise NotImplementedError() + + def __getstate__(self): + return {} + + 
def __setstate__(self, d): + self._constitute_tuple(d) + + def __reduce__(self): + return ( + type(self), + (), + self.__getstate__() + ) + + cdef bint min_sample_leaf_condition( Splitter splitter, SplitRecord* current_split, @@ -71,7 +96,7 @@ cdef bint min_sample_leaf_condition( return True cdef class MinSamplesLeafCondition(SplitCondition): - def __cinit__(self): + def _constitute_tuple(self, d): self.t.f = min_sample_leaf_condition self.t.p = NULL # min_samples is stored in splitter, which is already passed to f @@ -94,7 +119,7 @@ cdef bint min_weight_leaf_condition( return True cdef class MinWeightLeafCondition(SplitCondition): - def __cinit__(self): + def _constitute_tuple(self, d): self.t.f = min_weight_leaf_condition self.t.p = NULL # min_weight_leaf is stored in splitter, which is already passed to f @@ -121,7 +146,7 @@ cdef bint monotonic_constraint_condition( return True cdef class MonotonicConstraintCondition(SplitCondition): - def __cinit__(self): + def _constitute_tuple(self, d): self.t.f = monotonic_constraint_condition self.t.p = NULL @@ -304,8 +329,8 @@ cdef class Splitter(BaseSplitter): float64_t min_weight_leaf, object random_state, const cnp.int8_t[:] monotonic_cst, - SplitCondition[:] presplit_conditions = None, - SplitCondition[:] postsplit_conditions = None, + list presplit_conditions = [], + list postsplit_conditions = [], *argv ): """ @@ -346,32 +371,62 @@ cdef class Splitter(BaseSplitter): self.monotonic_cst = monotonic_cst self.with_monotonic_cst = monotonic_cst is not None - self.min_samples_leaf_condition = MinSamplesLeafCondition() - self.min_weight_leaf_condition = MinWeightLeafCondition() - - self.presplit_conditions.push_back((self.min_samples_leaf_condition).t) - if presplit_conditions is not None: - for condition in presplit_conditions: - self.presplit_conditions.push_back((condition).t) + self._presplit_conditions = [] if presplit_conditions is None else presplit_conditions + self._postsplit_conditions = [] if postsplit_conditions 
is None else postsplit_conditions - self.postsplit_conditions.push_back((self.min_weight_leaf_condition).t) - if postsplit_conditions is not None: - for condition in postsplit_conditions: - self.postsplit_conditions.push_back((condition).t) + self._presplit_conditions.append(MinSamplesLeafCondition()) + self._postsplit_conditions.append(MinWeightLeafCondition()) if(self.with_monotonic_cst): - self.monotonic_constraint_condition = MonotonicConstraintCondition() - self.presplit_conditions.push_back((self.monotonic_constraint_condition).t) - self.postsplit_conditions.push_back((self.monotonic_constraint_condition).t) + self._presplit_conditions.append(MonotonicConstraintCondition()) + self._postsplit_conditions.append(MonotonicConstraintCondition()) + + self._constitute_split_conditions() + def _constitute_split_conditions(self): + for condition in self._presplit_conditions: + if not isinstance(condition, SplitCondition): + raise ValueError("All conditions must be of type SplitCondition") + self.presplit_conditions.push_back(&((condition).t)) + + for condition in self._postsplit_conditions: + if not isinstance(condition, SplitCondition): + raise ValueError("All conditions must be of type SplitCondition") + self.postsplit_conditions.push_back(&((condition).t)) + + def _constitute_split_conditions2(self): + self.presplit_conditions.resize(len(self._presplit_conditions)) + for i in range(len(self._presplit_conditions)): + if not isinstance(self._presplit_conditions[i], SplitCondition): + raise ValueError("All conditions must be of type SplitCondition") + self.presplit_conditions[i] = &((self._presplit_conditions[i]).t) + + self.postsplit_conditions.resize(len(self._postsplit_conditions)) + for i in range(len(self._postsplit_conditions)): + if not isinstance(self._postsplit_conditions[i], SplitCondition): + raise ValueError("All conditions must be of type SplitCondition") + self.postsplit_conditions[i] = &((self._postsplit_conditions[i]).t) + + def __setstate__(self, d): 
+ super(Splitter, self).__setstate__(d) + self._constitute_split_conditions() + def __reduce__(self): - return (type(self), (self.criterion, - self.max_features, - self.min_samples_leaf, - self.min_weight_leaf, - self.random_state, - self.monotonic_cst.base if self.monotonic_cst is not None else None), self.__getstate__()) + return ( + type(self), + ( + self.criterion, + self.max_features, + self.min_samples_leaf, + self.min_weight_leaf, + self.random_state, + self.monotonic_cst.base if self.monotonic_cst is not None else None, + self._presplit_conditions, + self._postsplit_conditions + ), + self.__getstate__() + ) cdef int init( self,