From e4d257d582cc670c34085a22562b94fbd5005af1 Mon Sep 17 00:00:00 2001
From: Perro2110
Date: Mon, 13 Jan 2025 14:41:55 +0100
Subject: [PATCH 01/44] Fix evaluaterule for antecedents with a single disjunct [TODO: review this]

---
 src/evaluate.jl | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/src/evaluate.jl b/src/evaluate.jl
index a460498..c84d198 100644
--- a/src/evaluate.jl
+++ b/src/evaluate.jl
@@ -323,7 +323,20 @@ function evaluaterule(
     checkmask, explanations = begin
         if compute_explanations
             # Note: This is kind of quick and dirty.
-            disjs = SoleLogics.disjuncts(SoleLogics.LeftmostDisjunctiveForm(antecedent(rule)))
+
+            #disjs = SoleLogics.disjuncts(SoleLogics.LeftmostDisjunctiveForm(antecedent(rule)))
+
+            ante = antecedent(rule)
+            if (ante isa SyntaxBranch)
+                # Disjunctive root: convert to disjunctive form, then extract the disjuncts
+                dnf = SoleLogics.LeftmostDisjunctiveForm(ante)
+                disjs = SoleLogics.disjuncts(dnf)
+            else
+                # No OR at the root → a single disjunct
+                disjs = [ante]
+            end
+
+
             checkmatrix = hcat([check(disj, X; kwargs...) for disj in disjs]...)
             # @show checkmatrix
             checkmask = map(any, eachrow(checkmatrix))

From 67af7723f3ba1be3e180580ab172641d45118697 Mon Sep 17 00:00:00 2001
From: giopaglia <24519853+giopaglia@users.noreply.github.com>
Date: Mon, 20 Jan 2025 18:13:32 +0100
Subject: [PATCH 02/44] Add test, add iscomplete

---
 src/utils/models/ensembles.jl |  3 ++-
 test/runtests.jl              |  2 +-
 test/test_tree.jl             | 30 ++++++++++++++++++++++++++++++
 3 files changed, 33 insertions(+), 2 deletions(-)
 create mode 100644 test/test_tree.jl

diff --git a/src/utils/models/ensembles.jl b/src/utils/models/ensembles.jl
index 7f1d6cb..645dc6f 100644
--- a/src/utils/models/ensembles.jl
+++ b/src/utils/models/ensembles.jl
@@ -95,7 +95,6 @@ struct DecisionEnsemble{O,T<:AbstractModel,A<:Base.Callable,W<:Union{Nothing,Abs
         O = Union{outcometype.(models)...}
         DecisionEnsemble{O}(models, args...; kwargs...)
     end
-
 end


@@ -105,6 +104,8 @@ modelstype(m::DecisionEnsemble{O,T}) where {O,T} = T
 models(m::DecisionEnsemble) = m.models
 nmodels(m::DecisionEnsemble) = length(models(m))

+iscomplete(m::DecisionEnsemble) = any(iscomplete.(models(m)))
+
 aggregation(m::DecisionEnsemble) = m.aggregation
 weights(m::DecisionEnsemble) = m.weights
 # Returns the aggregation function, patched by weights if the model has them.
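
For context, the guard that PATCH 01 introduces above can be sanity-checked in
isolation. A minimal sketch (assuming SoleLogics' `parseformula`, `SyntaxBranch`,
`LeftmostDisjunctiveForm` and `disjuncts` behave as they are used in the patch):

    using SoleLogics

    # "p ∨ q" parses to a SyntaxBranch rooted in ∨, so it can be normalized
    # into a leftmost disjunctive form and split into its disjuncts.
    ante = SoleLogics.parseformula("p ∨ q")
    disjs = if ante isa SyntaxBranch
        SoleLogics.disjuncts(SoleLogics.LeftmostDisjunctiveForm(ante))
    else
        [ante]
    end
    @assert length(disjs) == 2

    # A lone atom is a syntax leaf, not a SyntaxBranch, so the guard skips the
    # normalization and wraps the antecedent as a single disjunct instead.
    single = SoleLogics.parseformula("p")
    @assert !(single isa SyntaxBranch)   # hence the `disjs = [ante]` branch
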
diff --git a/test/runtests.jl b/test/runtests.jl
index 8d95df5..2e946c0 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -15,7 +15,7 @@ end
 println("Julia version: ", VERSION)

 test_suites = [
-    ("Models", ["base.jl", ]),
+    ("Models", ["base.jl", "test_tree.jl"]),
     ("Miscellaneous", ["misc.jl", ]),
     ("Parse", ["parse.jl", ]),
     ("Rules", ["juliacon2024.jl", ]),
diff --git a/test/test_tree.jl b/test/test_tree.jl
new file mode 100644
index 0000000..04910dc
--- /dev/null
+++ b/test/test_tree.jl
@@ -0,0 +1,30 @@
+################################################
+   #                p
+   #        ┌───────┴─────────────┐
+   #        │                     r
+   #        q                 ┌───┴───┐
+   #        │                 s     "yes"
+   #    ┌───┴───┐         ┌───┴───┐
+   #  "yes"   "no"      "yes"   "no"
+##################################################
+
+using SoleLogics
+using SoleModels
+
+formula_p = SoleLogics.parseformula("p")
+formula_q = SoleLogics.parseformula("q")
+formula_r = SoleLogics.parsebaseformula("r")
+formula_s = SoleLogics.parsebaseformula("s")
+
+branch_q = Branch(formula_q,("yes","no"),(;))
+branch_s = Branch(formula_s,("yes","no"),(;))
+branch_r = Branch(formula_r,(branch_s,"yes"),(;))
+
+#dt_q = DecisionTree(branch_r,(;))
+
+
+# Possible paths
+path_all = [formula_p,formula_q,formula_s,formula_r,"yes"]
+path_2 = [formula_p,formula_q,"yes"]
+path_1 = [formula_p,"yes"]
+path_0 = ["yes"]

From a287352107a32d5cceb4672a98de99743b5f86f3 Mon Sep 17 00:00:00 2001
From: Perro2110
Date: Wed, 19 Feb 2025 17:33:35 +0100
Subject: [PATCH 03/44] Refactoring: rename extractrules to modalextractrules

---
 src/SoleModels.jl         |  2 +-
 src/rule-extraction.jl    | 16 ++++++++--------
 src/utils/models/other.jl |  2 +-
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/SoleModels.jl b/src/SoleModels.jl
index 6ba88eb..0fbd307 100644
--- a/src/SoleModels.jl
+++ b/src/SoleModels.jl
@@ -93,7 +93,7 @@ export subtreeheight
 include("symbolic-utils.jl")

 export PlainRuleExtractor
-export extractrules, listrules, joinrules
+export modalextractrules, listrules, joinrules

 include("rule-extraction.jl")

diff --git a/src/rule-extraction.jl b/src/rule-extraction.jl
index 50a86c0..d94fb05 100644
--- a/src/rule-extraction.jl
+++ b/src/rule-extraction.jl
@@ -6,7 +6,7 @@ An exact or heuristic logical method for extracting logical rules from symbolic models.

 Refer to [SolePostHoc](https://github.com/aclai-lab/SolePostHoc.jl) for rule extraction methods.

-See also [`extractrules`](@ref), [`Rule`](@ref), [`issymbolicmodel`](@ref).
+See also [`modalextractrules`](@ref), [`Rule`](@ref), [`issymbolicmodel`](@ref).
 """
 abstract type RuleExtractor end

 """
 Return whether a rule extraction method is known to be exact (as opposed to heuristic).
 """
 isexact(::RuleExtractor) = false

 """
-    extractrules(re::RuleExtractor, m, args...; kwargs...)
+    modalextractrules(re::RuleExtractor, m, args...; kwargs...)

 Extract rules from symbolic model `m`, using a rule extraction method `re`.
 """
-function extractrules(re::RuleExtractor, m, args...; kwargs...)
-    return error("Please, provide method extractrules(::$(typeof(m)), args...; kwargs...).")
+function modalextractrules(re::RuleExtractor, m, args...; kwargs...)
+    return error("Please, provide method modalextractrules(::$(typeof(m)), args...; kwargs...).")
 end

 # Helpers
 function (RE::Type{<:RuleExtractor})(args...; kwargs...)
-    return extractrules(RE(), args...; kwargs...)
+    return modalextractrules(RE(), args...; kwargs...)
 end

 # Helpers
 function (re::RuleExtractor)(args...; kwargs...)
-    return extractrules(re, args...; kwargs...)
+    return modalextractrules(re, args...; kwargs...)
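+    # Note: with these two helpers, both `PlainRuleExtractor(m)` (calling the
+    # extractor type) and `PlainRuleExtractor()(m)` (calling an instance)
+    # forward to `modalextractrules(PlainRuleExtractor(), m)`.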
 end

 """
 Plain extraction method involves listing one rule for each possible symbolic path within the model.
-With this method, [`extractrules`](@ref) redirects to [`listrules`](@ref).
+With this method, [`modalextractrules`](@ref) redirects to [`listrules`](@ref).

 See also [`listrules`](@ref), [`Rule`](@ref), [`issymbolicmodel`](@ref).
 """
 struct PlainRuleExtractor <: RuleExtractor end

 isexact(::PlainRuleExtractor) = true

-function extractrules(::PlainRuleExtractor, m, args...; kwargs...)
+function modalextractrules(::PlainRuleExtractor, m, args...; kwargs...)
     if haslistrules(m)
         listrules(m, args...; kwargs...)
     else
diff --git a/src/utils/models/other.jl b/src/utils/models/other.jl
index 60f79f6..bb9280b 100644
--- a/src/utils/models/other.jl
+++ b/src/utils/models/other.jl
@@ -361,7 +361,7 @@ iscomplete(m::DecisionSet) = m.iscomplete
 isnonoverlapping(m::DecisionSet) = m.isnonoverlapping

 function listrules(m::DecisionSet)
-    isnonoverlapping(m) || error("Cannot listrules from an overlapping decision set. Try `extractrules` with heuristics, instead.")
+    isnonoverlapping(m) || error("Cannot listrules from an overlapping decision set. Try `modalextractrules` with heuristics, instead.")
     rules(m)
 end

From b2312c1d053a8bf3e952d5e9588e8b843b052692 Mon Sep 17 00:00:00 2001
From: Perro2110
Date: Tue, 11 Mar 2025 21:38:36 +0100
Subject: [PATCH 04/44] Minor fix: guard sensitivity/specificity against empty check masks

---
 src/evaluate.jl | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/src/evaluate.jl b/src/evaluate.jl
index c84d198..4401ce3 100644
--- a/src/evaluate.jl
+++ b/src/evaluate.jl
@@ -350,11 +350,24 @@ function evaluaterule(
     end
     pos_checkmask = checkmask[classmask]
     neg_checkmask = checkmask[(!).(classmask)]
+
+    #=
+    println("pos_checkmask: ",pos_checkmask)
+    println("neg_checkmask: ",neg_checkmask)
+
+    println("lgh pos_checkmask ",length(pos_checkmask))
+    println("lgh neg_checkmask ",length(neg_checkmask))
+    =#
+
+    # Guard against empty arrays
+    sensitivity = length(pos_checkmask) > 0 ? sum(pos_checkmask)/length(pos_checkmask) : 0.0
+    specificity = length(neg_checkmask) > 0 ?
1-(sum(neg_checkmask)/length(neg_checkmask)) : 1.0 + out = (; classmask = classmask, checkmask = checkmask, - sensitivity = sum(pos_checkmask)/length(pos_checkmask), - specificity = 1-(sum(neg_checkmask)/length(neg_checkmask)), + sensitivity = sensitivity, + specificity = specificity, explanations = explanations, ) return out From 4bc2854394c93f9f67f1c8c7f2acdc7300712459 Mon Sep 17 00:00:00 2001 From: Perro2110 Date: Sat, 29 Mar 2025 10:21:56 +0100 Subject: [PATCH 05/44] update image_family --- .cirrus.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.cirrus.yml b/.cirrus.yml index 0a3671d..013f726 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -1,5 +1,5 @@ freebsd_instance: - image_family: freebsd-14-0 + image_family: freebsd-14-2 task: name: FreeBSD artifacts_cache: From afd11612e7e992aca296af414bb0d76bbd6c6c29 Mon Sep 17 00:00:00 2001 From: PasoStudio73 Date: Sat, 29 Mar 2025 21:54:24 +0100 Subject: [PATCH 06/44] decisiontree and xgboost ext working --- ext/DecisionTreeExt.jl | 11 ++- ext/XGBoostExt.jl | 157 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 166 insertions(+), 2 deletions(-) diff --git a/ext/DecisionTreeExt.jl b/ext/DecisionTreeExt.jl index de67fdb..1755fb5 100644 --- a/ext/DecisionTreeExt.jl +++ b/ext/DecisionTreeExt.jl @@ -9,7 +9,7 @@ import DecisionTree as DT function get_condition(featid, featval, featurenames) test_operator = (<) # @show fieldnames(typeof(tree)) - feature = !isnothing(featurenames) ? VariableValue(featurenames[featid]) : VariableValue(featid) + feature = isnothing(featurenames) ? VariableValue(featid) : VariableValue(featid, featurenames[featid]) return ScalarCondition(feature, test_operator, featval) end @@ -106,9 +106,16 @@ function SoleModels.solemodel( return m end -function SoleModels.solemodel(tree::DT.InfoNode; keep_condensed = false, featurenames = true, classlabels = tree.info.classlabels, kwargs...) +function SoleModels.solemodel( + tree::DT.InfoNode{T,orig_O}; + keep_condensed=false, + featurenames=true, + # classlabels=tree.info.classlabels, + kwargs... +) where {T,orig_O} # @show fieldnames(typeof(tree)) featurenames = featurenames == true ? tree.info.featurenames : featurenames + classlabels = haskey(tree.info, :classlabels) ? tree.info.classlabels : nothing root, info = begin if keep_condensed diff --git a/ext/XGBoostExt.jl b/ext/XGBoostExt.jl index ad346ae..7925a6b 100644 --- a/ext/XGBoostExt.jl +++ b/ext/XGBoostExt.jl @@ -89,5 +89,162 @@ end # return Branch(antecedent, left_tree, right_tree, info) # end +function get_condition(featidstr, featval, featurenames; test_operator) + featid = parse(Int, featidstr[2:end]) + 1 # considering 0-based indexing in XGBoost feature ids + feature = isnothing(featurenames) ? 
VariableValue(featid) : VariableValue(featid, featurenames[featid]) + return ScalarCondition(feature, test_operator, featval) +end + +function satisfies_conditions(row, formula) + # check_cond = true + # for atom in formula + # if !atom.value.metacond.test_operator(row[atom.value.metacond.feature.i_variable], atom.value.threshold) + # check_cond = false + # end + # end + # return check_cond + + all(atom -> atom.value.metacond.test_operator( + row[atom.value.metacond.feature.i_variable], + atom.value.threshold), formula + ) +end + +function bitmap_check_conditions(X, formula) + BitVector([satisfies_conditions(row, formula) for row in eachrow(X)]) +end + +function SoleModels.solemodel( + model::Vector{<:XGBoost.Node}, + # args...; + X::AbstractMatrix, + y::AbstractVector; + weights::Union{AbstractVector{<:Number}, Nothing}=nothing, + classlabels = nothing, + featurenames = nothing, + keep_condensed = false, + kwargs... +) + # TODO + if keep_condensed && !isnothing(classlabels) + # info = (; + # apply_preprocess=(y -> orig_O(findfirst(x -> x == y, classlabels))), + # apply_postprocess=(y -> classlabels[y]), + # ) + info = (; + apply_preprocess=(y -> findfirst(x -> x == y, classlabels)), + apply_postprocess=(y -> classlabels[y]), + ) + keep_condensed = !keep_condensed + # O = eltype(classlabels) + else + info = (;) + # O = orig_O + end + + trees = map(t -> begin + # isnothing(t.split) ? + # xgbleaf(t, Formula[], X, y; classlabels, featurenames) : + SoleModels.solemodel(t, X, y; classlabels, featurenames, keep_condensed, kwargs...) + end, model) + + if !isnothing(featurenames) + info = merge(info, (; featurenames=featurenames, )) + end + + info = merge(info, (; + leaf_values=vcat([t.info[:leaf_values] for t in trees]...), + supporting_predictions=vcat([t.info[:supporting_predictions] for t in trees]...), + supporting_labels=vcat([t.info[:supporting_labels] for t in trees]...), + ) + ) + + return isnothing(weights) ? + DecisionEnsemble(trees, info) : + DecisionEnsemble(trees, weights, info) +end + +""" + solemodel(tree::XGBoost.Node; fl=Formula[], fr=Formula[], classlabels=nothing, featurenames=nothing, keep_condensed=false) + +Traverses a learned XGBoost tree, collecting the path conditions for each branch. +Left paths (<) store conditions in `fl`, right paths (≥) store conditions in `fr`. +When reaching a leaf, calls `xgbleaf` with the path's collected conditions. 
+""" +function SoleModels.solemodel( + tree::XGBoost.Node, + X::AbstractMatrix, + y::AbstractVector; + path_conditions = Formula[], + classlabels=nothing, + featurenames=nothing, + keep_condensed=false +) + keep_condensed && error("Cannot keep condensed XGBoost.Node.") + + # xgboost trees could be composed of only one leaf, without any split + # isnothing(tree.split) && return nothing + isnothing(tree.split) && return xgbleaf(tree, Formula[], X, y; classlabels, featurenames) + + antecedent = Atom(get_condition(tree.split, tree.split_condition, featurenames; test_operator=(<))) + + # Create a new path for the left branch + left_path = copy(path_conditions) + push!(left_path, Atom(get_condition(tree.split, tree.split_condition, featurenames; test_operator=(<)))) + + # Create a new path for the right branch + right_path = copy(path_conditions) + push!(right_path, Atom(get_condition(tree.split, tree.split_condition, featurenames; test_operator=(≥)))) + + lefttree = if isnothing(tree.children[1].split) + # @show SoleModels.join_antecedents(left_path) + xgbleaf(tree.children[1], left_path, X, y; classlabels, featurenames) + else + SoleModels.solemodel(tree.children[1], X, y; path_conditions=left_path, classlabels=classlabels, featurenames=featurenames) + end + isnothing(lefttree) && return Nothing + + righttree = if isnothing(tree.children[2].split) + # @show SoleModels.join_antecedents(right_path) + xgbleaf(tree.children[2], right_path, X, y; classlabels, featurenames) + else + SoleModels.solemodel(tree.children[2], X, y; path_conditions=right_path, classlabels=classlabels, featurenames=featurenames) + end + isnothing(righttree) && return Nothing + + info = (; + leaf_values = [lefttree.info[:leaf_values]..., righttree.info[:leaf_values]...], + supporting_predictions = [lefttree.info[:supporting_predictions]..., righttree.info[:supporting_predictions]...], + supporting_labels = [lefttree.info[:supporting_labels]..., righttree.info[:supporting_labels]...], + ) + return Branch(antecedent, lefttree, righttree, info) +end + +function xgbleaf( + leaf::XGBoost.Node, + formula::Vector{<:Formula}, + X::AbstractMatrix, + y::AbstractVector; + classlabels=nothing, + featurenames=nothing, + keep_condensed=false +) + keep_condensed && error("Cannot keep condensed XGBoost.Node.") + + bitX = bitmap_check_conditions(X, formula) + push!(bitX, 0) + + labels = unique(y) + prediction = SoleModels.bestguess(y[bitX]; suppress_parity_warning=true) + + isnothing(prediction) && (prediction = labels[findfirst(x -> x == "nothing", labels)]) + + info = (; + leaf_values = leaf.leaf, + supporting_predictions = fill(prediction, length(labels)), + supporting_labels = labels, +) + return SoleModels.ConstantModel(prediction, info) +end end From 183de681bad596831350f0c10d54b419236ad62a Mon Sep 17 00:00:00 2001 From: Perro2110 Date: Tue, 1 Apr 2025 21:29:53 +0200 Subject: [PATCH 07/44] minor fix in Project.toml for compatibility SoleBase = "0.11 - 0.13" --- Project.toml | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/Project.toml b/Project.toml index 23b221e..e589a0b 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,12 @@ name = "SoleModels" uuid = "4249d9c7-3290-4ddd-961c-e1d3ec2467f8" license = "MIT" -authors = ["Michele GHIOTTI", "Giovanni PAGLIARINI", "Edoardo PONSANESI", "Eduard I. STAN"] +authors = [ + "Michele GHIOTTI", + "Giovanni PAGLIARINI", + "Edoardo PONSANESI", + "Eduard I. 
STAN", +] version = "0.10.0" [deps] @@ -61,9 +66,9 @@ ProgressMeter = "1" Random = "1" Reexport = "1" Revise = "3" -SoleBase = "0.13" +SoleBase = "0.11 - 0.13" SoleData = "0.15, 0.16" -SoleLogics = "0.11 - 0.12" +SoleLogics = "0.11 - 0.13" StatsBase = "0.30 - 0.34" Suppressor = "0.2" Tables = "1" @@ -89,4 +94,19 @@ SoleData = "123f1ae1-6307-4526-ab5b-aab3a92a2b8c" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Test", "DataFrames", "Random", "MLJ", "MLJXGBoostInterface", "MultiData", "Markdown", "InteractiveUtils", "BenchmarkTools", "MLJBase", "XGBoost", "DecisionTree", "MLJDecisionTreeInterface", "SoleData"] +test = [ + "Test", + "DataFrames", + "Random", + "MLJ", + "MLJXGBoostInterface", + "MultiData", + "Markdown", + "InteractiveUtils", + "BenchmarkTools", + "MLJBase", + "XGBoost", + "DecisionTree", + "MLJDecisionTreeInterface", + "SoleData", +] From 8ee5cf383c5443e1dae38b7d0bb1c121a9709e4e Mon Sep 17 00:00:00 2001 From: giopaglia <24519853+giopaglia@users.noreply.github.com> Date: Wed, 2 Apr 2025 18:02:26 +0200 Subject: [PATCH 08/44] Fix --- test/misc.jl | 1 - test/test_tree.jl | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/test/misc.jl b/test/misc.jl index cbafaf2..84245ac 100644 --- a/test/misc.jl +++ b/test/misc.jl @@ -11,7 +11,6 @@ using SoleModels: listrules, displaymodel, submodels io = IOBuffer() -# parse_other_kind_of_formula = SoleLogics.parsebaseformula parse_other_kind_of_formula = SoleLogics.parseformula ################################### LeafModel ############################################# diff --git a/test/test_tree.jl b/test/test_tree.jl index 04910dc..af4cb1b 100644 --- a/test/test_tree.jl +++ b/test/test_tree.jl @@ -13,8 +13,8 @@ using SoleModels formula_p = SoleLogics.parseformula("p") formula_q = SoleLogics.parseformula("q") -formula_r = SoleLogics.parsebaseformula("r") -formula_s = SoleLogics.parsebaseformula("s") +formula_r = SoleLogics.parseformula("r") +formula_s = SoleLogics.parseformula("s") branch_q = Branch(formula_q,("yes","no"),(;)) branch_s = Branch(formula_s,("yes","no"),(;)) From d870851a8de02865e11dfa0560741c3ab0121fb3 Mon Sep 17 00:00:00 2001 From: PasoStudio73 Date: Wed, 2 Apr 2025 21:35:15 +0200 Subject: [PATCH 09/44] debug xgboost, not yet ready --- test/DecisionTreeExt/forest.jl | 9 ++-- test/DecisionTreeExt/tree.jl | 7 ++- test/XgBoostExt/xgboost.jl | 82 ++++++++++++++++++++++++++++++++++ 3 files changed, 93 insertions(+), 5 deletions(-) create mode 100644 test/XgBoostExt/xgboost.jl diff --git a/test/DecisionTreeExt/forest.jl b/test/DecisionTreeExt/forest.jl index 4e0d1b5..f82afd2 100644 --- a/test/DecisionTreeExt/forest.jl +++ b/test/DecisionTreeExt/forest.jl @@ -6,6 +6,7 @@ using DataFrames using MLJDecisionTreeInterface using SoleModels +using Random import DecisionTree as DT @@ -13,8 +14,9 @@ X, y = @load_iris X = DataFrame(X) train_ratio = 0.8 +rng = Xoshiro(11) -train, test = partition(eachindex(y), train_ratio, shuffle=true) +train, test = partition(eachindex(y), train_ratio; shuffle=true, rng) X_train, y_train = X[train, :], y[train] X_test, y_test = X[test, :], y[test] @@ -52,10 +54,11 @@ preds = apply(solem, X_test) preds2 = apply!(solem, X_test, y_test) @test preds == preds2 -@test sum(preds .== y_test)/length(y_test) >= 0.8 +accuracy = sum(preds .== y_test)/length(y_test) +@test accuracy >= 0.8 # apply!(solem, X_test, y_test, mode = :append) printmodel(solem; max_depth = 7, show_intermediate_finals = true, show_metrics = true) -@test_broken printmodel.(listrules(solem, min_lift = 1.0, 
min_ninstances = 0); show_metrics = true); +# @test_broken printmodel.(listrules(solem, min_lift = 1.0, min_ninstances = 0); show_metrics = true); diff --git a/test/DecisionTreeExt/tree.jl b/test/DecisionTreeExt/tree.jl index 6936d00..a0c349a 100644 --- a/test/DecisionTreeExt/tree.jl +++ b/test/DecisionTreeExt/tree.jl @@ -6,6 +6,7 @@ using DataFrames using MLJDecisionTreeInterface using SoleModels +using Random import DecisionTree as DT @@ -13,8 +14,9 @@ X, y = @load_iris X = DataFrame(X) train_ratio = 0.8 +rng = Xoshiro(11) -train, test = partition(eachindex(y), train_ratio, shuffle=true) +train, test = partition(eachindex(y), train_ratio; shuffle=true, rng) X_train, y_train = X[train, :], y[train] X_test, y_test = X[test, :], y[test] @@ -47,7 +49,8 @@ preds = apply(solem, X_test) preds2 = apply!(solem, X_test, y_test) @test preds == preds2 -@test sum(preds .== y_test)/length(y_test) > 0.7 +accuracy = sum(preds .== y_test)/length(y_test) +@test accuracy > 0.7 # apply!(solem, X_test, y_test, mode = :append) diff --git a/test/XgBoostExt/xgboost.jl b/test/XgBoostExt/xgboost.jl new file mode 100644 index 0000000..b8d9a24 --- /dev/null +++ b/test/XgBoostExt/xgboost.jl @@ -0,0 +1,82 @@ +using Test + +using MLJ +using MLJBase +using DataFrames + +using MLJXGBoostInterface +using SoleModels + +import XGBoost as XGB + +using Random, CategoricalArrays + +X, y = @load_iris +X = DataFrame(X) + +train_ratio = 0.8 +rng = Xoshiro(11) + +train, test = partition(eachindex(y), train_ratio; shuffle=true, rng) +X_train, y_train = X[train, :], y[train] +X_test, y_test = X[test, :], y[test] + +println("Training set size: ", size(X_train), " - ", size(y_train)) +println("Test set size: ", size(X_test), " - ", size(y_test)) +println("Training set type: ", typeof(X_train), " - ", typeof(y_train)) +println("Test set type: ", typeof(X_test), " - ", typeof(y_test)) + +XGTrees = MLJ.@load XGBoostClassifier pkg=XGBoost + +model = XGTrees(; + num_round=10, + max_depth=6, + objective="multi:softmax" +) + +# Bind the model and data into a machine +mach = machine(model, X_train, y_train) +# Fit the model +fit!(mach) + +trees = XGB.trees(mach.fitresult[1]) + +featurenames = mach.report.vals[1][1] +ds_safetest = vcat(y, "nothing") + + +solem = solemodel(trees, Matrix(X), ds_safetest) +solem = solemodel(trees, Matrix(X), ds_safetest; featurenames) +solem = solemodel(trees, Matrix(X), ds_safetest; featurenames, keep_condensed = false) + +@test SoleData.scalarlogiset(X_test; allow_propositional = true) isa PropositionalLogiset + +# Make test instances flow into the model +preds = apply(solem, X_test) +preds2 = apply!(solem, X_test, y_test) + +@test preds == preds2 +accuracy = sum(preds .== y_test)/length(y_test) +@test accuracy > 0.7 + +# apply!(solem, X_test, y_test, mode = :append) + +solem = @test_throws ErrorException solemodel(trees, Matrix(X), ds_safetest; featurenames, keep_condensed = true) +solem = @test_nowarn solemodel(trees, Matrix(X), ds_safetest; featurenames, keep_condensed = false) + +printmodel(solem; max_depth = 7, show_intermediate_finals = true, show_metrics = true) + +# comparision with XGBoost.jl + +yl_train = CategoricalArrays.levelcode.(categorical(y_train)) .- 1 +# create and train a gradient boosted tree model of 5 trees +bst = XGB.xgboost( + (X_train, yl_train), + num_round=10, + num_class=3, + max_depth=6, + objective="multi:softmax" +) +# obtain model predictions +ŷ = XGB.predict(bst, X_test) + From 5de4302bb82c6a0f1d8cb2f83812d5287f513a77 Mon Sep 17 00:00:00 2001 From: PasoStudio73 Date: Thu, 10 
Apr 2025 00:29:09 +0200 Subject: [PATCH 10/44] Refining XGBoost --- ext/XGBoostExt.jl | 157 ++++++------ src/SoleModels.jl | 2 + src/print.jl | 2 +- src/utils/models/ensembles.jl | 354 +++++++++++++++++++--------- src/utils/models/leaf.jl | 37 +++ src/utils/models/rule-and-branch.jl | 29 +++ test/XgBoostExt/xgboost.jl | 68 +++++- 7 files changed, 448 insertions(+), 201 deletions(-) diff --git a/ext/XGBoostExt.jl b/ext/XGBoostExt.jl index 7925a6b..78d7031 100644 --- a/ext/XGBoostExt.jl +++ b/ext/XGBoostExt.jl @@ -3,6 +3,8 @@ module XGBoostExt using SoleModels using XGBoost +using CategoricalArrays + import SoleModels: alphabet, solemodel function alphabet(model::XGBoost.Booster; kwargs...) @@ -31,8 +33,8 @@ function alphabet(model::XGBoost.Booster; kwargs...) _alphabet!(Atom{ScalarCondition}[], model; kwargs...) end - # TODO fix and test. Problem: where are the tree weights? How do I write this in the multi-class case? +# leaf values are actually the weight of the tree # # Convert an XGBoost.Booster to a Sole Ensemble # function solemodel(model::XGBoost.Booster; with_stats::Bool = true, kwargs...) @@ -95,15 +97,12 @@ function get_condition(featidstr, featval, featurenames; test_operator) return ScalarCondition(feature, test_operator, featval) end -function satisfies_conditions(row, formula) - # check_cond = true - # for atom in formula - # if !atom.value.metacond.test_operator(row[atom.value.metacond.feature.i_variable], atom.value.threshold) - # check_cond = false - # end - # end - # return check_cond +function get_condition(class_idx, featurenames; test_operator, featval) + feature = isnothing(featurenames) ? VariableValue(class_idx) : VariableValue(class_idx, featurenames[class_idx]) + return ScalarCondition(feature, test_operator, featval) +end +function satisfies_conditions(row, formula) all(atom -> atom.value.metacond.test_operator( row[atom.value.metacond.feature.i_variable], atom.value.threshold), formula @@ -114,54 +113,59 @@ function bitmap_check_conditions(X, formula) BitVector([satisfies_conditions(row, formula) for row in eachrow(X)]) end +function early_return(leaf, antecedent, clabel, classl) + info =(; + leaf_values = leaf, + supporting_predictions = clabel, + supporting_labels = [classl], + ) + + return Branch( + antecedent, + SoleModels.ConstantModel(first(clabel), info), + SoleModels.ConstantModel(first(clabel), info), + info + ) +end + +# ---------------------------------------------------------------------------- # +# DecisionXGBoost solemodel # +# ---------------------------------------------------------------------------- # function SoleModels.solemodel( model::Vector{<:XGBoost.Node}, - # args...; X::AbstractMatrix, y::AbstractVector; - weights::Union{AbstractVector{<:Number}, Nothing}=nothing, - classlabels = nothing, - featurenames = nothing, - keep_condensed = false, + classlabels, + featurenames=nothing, + keep_condensed=false, kwargs... ) - # TODO - if keep_condensed && !isnothing(classlabels) - # info = (; - # apply_preprocess=(y -> orig_O(findfirst(x -> x == y, classlabels))), - # apply_postprocess=(y -> classlabels[y]), - # ) - info = (; - apply_preprocess=(y -> findfirst(x -> x == y, classlabels)), - apply_postprocess=(y -> classlabels[y]), - ) - keep_condensed = !keep_condensed - # O = eltype(classlabels) - else - info = (;) - # O = orig_O - end - - trees = map(t -> begin - # isnothing(t.split) ? - # xgbleaf(t, Formula[], X, y; classlabels, featurenames) : - SoleModels.solemodel(t, X, y; classlabels, featurenames, keep_condensed, kwargs...) 
- end, model) - - if !isnothing(featurenames) - info = merge(info, (; featurenames=featurenames, )) + keep_condensed && error("Cannot keep condensed XGBoost.Node.") + + nclasses = length(classlabels) + + trees = map(enumerate(model)) do (i, t) + class_idx = (i - 1) % nclasses + 1 + clabels = categorical([classlabels[class_idx]]) + # xgboost trees could be composed of only one leaf, without any split + if isnothing(t.split) + antecedent = Atom(get_condition(class_idx, featurenames; test_operator=(<), featval=Inf)) + early_return(t.leaf, antecedent, clabels, classlabels[class_idx]) + else + SoleModels.solemodel(t, X, y; classlabels, featurenames, class_idx, clabels, kwargs...) + end end - info = merge(info, (; - leaf_values=vcat([t.info[:leaf_values] for t in trees]...), - supporting_predictions=vcat([t.info[:supporting_predictions] for t in trees]...), - supporting_labels=vcat([t.info[:supporting_labels] for t in trees]...), + info = merge( + isnothing(featurenames) ? (;) : (;featurenames=featurenames), + (; + leaf_values = reduce(vcat, getindex.(getproperty.(trees, :info), :leaf_values)), + supporting_predictions = reduce(vcat, getindex.(getproperty.(trees, :info), :supporting_predictions)), + supporting_labels = reduce(vcat, getindex.(getproperty.(trees, :info), :supporting_labels)) ) ) - return isnothing(weights) ? - DecisionEnsemble(trees, info) : - DecisionEnsemble(trees, weights, info) + return DecisionXGBoost(trees, info) end """ @@ -175,42 +179,43 @@ function SoleModels.solemodel( tree::XGBoost.Node, X::AbstractMatrix, y::AbstractVector; - path_conditions = Formula[], - classlabels=nothing, + classlabels, + path_conditions=Formula[], featurenames=nothing, - keep_condensed=false + class_idx, + clabels ) - keep_condensed && error("Cannot keep condensed XGBoost.Node.") - - # xgboost trees could be composed of only one leaf, without any split - # isnothing(tree.split) && return nothing - isnothing(tree.split) && return xgbleaf(tree, Formula[], X, y; classlabels, featurenames) - antecedent = Atom(get_condition(tree.split, tree.split_condition, featurenames; test_operator=(<))) - - # Create a new path for the left branch + + # create a new path for the left branch left_path = copy(path_conditions) push!(left_path, Atom(get_condition(tree.split, tree.split_condition, featurenames; test_operator=(<)))) - # Create a new path for the right branch + # create a new path for the right branch right_path = copy(path_conditions) push!(right_path, Atom(get_condition(tree.split, tree.split_condition, featurenames; test_operator=(≥)))) lefttree = if isnothing(tree.children[1].split) # @show SoleModels.join_antecedents(left_path) - xgbleaf(tree.children[1], left_path, X, y; classlabels, featurenames) + xgbleaf(tree.children[1], left_path, X, y) else - SoleModels.solemodel(tree.children[1], X, y; path_conditions=left_path, classlabels=classlabels, featurenames=featurenames) + SoleModels.solemodel(tree.children[1], X, y; path_conditions=left_path, classlabels, class_idx, clabels,featurenames) end - isnothing(lefttree) && return Nothing - + isnothing(lefttree) && + begin + return early_return(tree.children[1].leaf, antecedent, clabels, classlabels[class_idx]) + end + righttree = if isnothing(tree.children[2].split) # @show SoleModels.join_antecedents(right_path) - xgbleaf(tree.children[2], right_path, X, y; classlabels, featurenames) + xgbleaf(tree.children[2], right_path, X, y) else - SoleModels.solemodel(tree.children[2], X, y; path_conditions=right_path, classlabels=classlabels, 
featurenames=featurenames) + SoleModels.solemodel(tree.children[2], X, y; path_conditions=right_path, classlabels, class_idx, clabels, featurenames) + end + isnothing(righttree) && + begin + return early_return(tree.children[2].leaf, antecedent, clabels, classlabels[class_idx]) end - isnothing(righttree) && return Nothing info = (; leaf_values = [lefttree.info[:leaf_values]..., righttree.info[:leaf_values]...], @@ -224,26 +229,20 @@ function xgbleaf( leaf::XGBoost.Node, formula::Vector{<:Formula}, X::AbstractMatrix, - y::AbstractVector; - classlabels=nothing, - featurenames=nothing, - keep_condensed=false + y::AbstractVector ) - keep_condensed && error("Cannot keep condensed XGBoost.Node.") - bitX = bitmap_check_conditions(X, formula) - push!(bitX, 0) - - labels = unique(y) prediction = SoleModels.bestguess(y[bitX]; suppress_parity_warning=true) + labels = unique(y) - isnothing(prediction) && (prediction = labels[findfirst(x -> x == "nothing", labels)]) + isnothing(prediction) && return nothing info = (; - leaf_values = leaf.leaf, - supporting_predictions = fill(prediction, length(labels)), - supporting_labels = labels, -) + leaf_values = leaf.leaf, + supporting_predictions = fill(prediction, length(labels)), + supporting_labels = labels, + ) + return SoleModels.ConstantModel(prediction, info) end diff --git a/src/SoleModels.jl b/src/SoleModels.jl index 6ba88eb..a3ae567 100644 --- a/src/SoleModels.jl +++ b/src/SoleModels.jl @@ -62,6 +62,8 @@ export DecisionEnsemble, models export DecisionForest, trees export DecisionSet, rules, nrules +export DecisionXGBoost + export MixedModel export haslistrules, solemodel diff --git a/src/print.jl b/src/print.jl index b111210..efc4e13 100644 --- a/src/print.jl +++ b/src/print.jl @@ -521,7 +521,7 @@ end function printmodel( io::IO, - m::DecisionEnsemble; + m::Union{DecisionEnsemble, DecisionXGBoost}; header = DEFAULT_HEADER, indentation_str = "", indentation = default_indentation, diff --git a/src/utils/models/ensembles.jl b/src/utils/models/ensembles.jl index 7f1d6cb..6738264 100644 --- a/src/utils/models/ensembles.jl +++ b/src/utils/models/ensembles.jl @@ -252,123 +252,119 @@ function ntrees(m::DecisionForest) length(trees(m)) end - - - -""" -A `MaxDecisionBag` is an ensemble of models, weighted by a set of other models. -In this simplified implementation, only the model with the highest (`max`) weight is responsible for the outcome. - -See also [`DecisionForest`](@ref), [`DecisionTree`](@ref), [`DecisionEnsemble`](@ref), [`MaxDecisionBag`](@ref). -""" -struct MaxDecisionBag{O,TO<:AbstractModel,TU<:AbstractModel - # ,A<:Base.Callable - # ,W<:Union{Nothing,AbstractVector} - } <: AbstractDecisionEnsemble{O} - output_producing_models::Vector{TO} - weight_producing_models::Vector{TU} - # aggregation::A - # weights::W - info::NamedTuple - - function MaxDecisionBag{O}( - output_producing_models::Vector, - weight_producing_models::Vector, - # aggregation::Union{Nothing,Base.Callable}, - # weights::Union{Nothing,AbstractVector}, - info::NamedTuple = (;); - suppress_parity_warning = nothing, - ) where {O} - @assert length(output_producing_models) > 0 "Cannot instantiate empty bagoutput-producing models!" - @assert length(weight_producing_models) > 0 "Cannot instantiate empty bagweight-producing models!" - @assert length(output_producing_models) == length(weight_producing_models) "Cannot instantiate bag with different numbers of output and weight producing models: $(length(output_producing_models)) != $(length(weight_producing_models))." 
- output_producing_models = wrap.(output_producing_models) - weight_producing_models = wrap.(weight_producing_models) - # if isnothing(aggregation) - # # if a suppress_parity_warning parameter is provided, then the aggregation's suppress_parity_warning defaults to it; - # # otherwise, it defaults to bestguess's suppress_parity_warning - # if isnothing(suppress_parity_warning) - # aggregation = function (args...; kwargs...) bestguess(args...; kwargs...) end - # else - # aggregation = function (args...; suppress_parity_warning = suppress_parity_warning, kwargs...) bestguess(args...; suppress_parity_warning, kwargs...) end - # end - # else - # isnothing(suppress_parity_warning) || @warn "Unexpected value for suppress_parity_warning: $(suppress_parity_warning)." - # end - TO = typeof(output_producing_models) - TU = typeof(weight_producing_models) - # W = typeof(weights) - # A = typeof(aggregation) - new{O,TO,TU}(output_producing_models, weight_producing_models, aggregation, info) # , weights - end +# """ +# A `MaxDecisionBag` is an ensemble of models, weighted by a set of other models. + +# See also [`DecisionForest`](@ref), [`DecisionTree`](@ref), [`DecisionEnsemble`](@ref), [`MaxDecisionBag`](@ref). +# """ +# struct DecisionBag{O,TO<:AbstractModel,TU<:AbstractModel +# # ,A<:Base.Callable +# # ,W<:Union{Nothing,AbstractVector} +# } <: AbstractDecisionEnsemble{O} +# output_producing_models::Vector{TO} +# weight_producing_models::Vector{TU} +# # aggregation::A +# # weights::W +# info::NamedTuple + +# function DecisionBag{O}( +# output_producing_models::Vector, +# weight_producing_models::Vector, +# # aggregation::Union{Nothing,Base.Callable}, +# # weights::Union{Nothing,AbstractVector}, +# info::NamedTuple = (;); +# suppress_parity_warning = nothing, +# ) where {O} +# @assert length(output_producing_models) > 0 "Cannot instantiate empty bagoutput-producing models!" +# @assert length(weight_producing_models) > 0 "Cannot instantiate empty bagweight-producing models!" +# @assert length(output_producing_models) == length(weight_producing_models) "Cannot instantiate bag with different numbers of output and weight producing models: $(length(output_producing_models)) != $(length(weight_producing_models))." +# output_producing_models = wrap.(output_producing_models) +# weight_producing_models = wrap.(weight_producing_models) +# # if isnothing(aggregation) +# # # if a suppress_parity_warning parameter is provided, then the aggregation's suppress_parity_warning defaults to it; +# # # otherwise, it defaults to bestguess's suppress_parity_warning +# # if isnothing(suppress_parity_warning) +# # aggregation = function (args...; kwargs...) bestguess(args...; kwargs...) end +# # else +# # aggregation = function (args...; suppress_parity_warning = suppress_parity_warning, kwargs...) bestguess(args...; suppress_parity_warning, kwargs...) end +# # end +# # else +# # isnothing(suppress_parity_warning) || @warn "Unexpected value for suppress_parity_warning: $(suppress_parity_warning)." +# # end +# TO = typeof(output_producing_models) +# TU = typeof(weight_producing_models) +# # W = typeof(weights) +# # A = typeof(aggregation) +# new{O,TO,TU}(output_producing_models, weight_producing_models, aggregation, info) # , weights +# end - function MaxDecisionBag( - output_producing_models::Vector, - weight_producing_models::Vector, - args...; kwargs... - ) - @assert length(output_producing_models) > 0 "Cannot instantiate empty bagoutput-producing models!" 
- @assert length(weight_producing_models) > 0 "Cannot instantiate empty bagweight-producing models!" - @assert length(output_producing_models) == length(weight_producing_models) "Cannot instantiate bag with different numbers of output and weight producing models: $(length(output_producing_models)) != $(length(weight_producing_models))." - output_producing_models = wrap.(output_producing_models) - weight_producing_models = wrap.(weight_producing_models) - O = Union{outcometype.(output_producing_models)...} - MaxDecisionBag{O}(output_producing_models, weight_producing_models, args...; kwargs...) - end -end +# function MaxDecisionBag( +# output_producing_models::Vector, +# weight_producing_models::Vector, +# args...; kwargs... +# ) +# @assert length(output_producing_models) > 0 "Cannot instantiate empty bagoutput-producing models!" +# @assert length(weight_producing_models) > 0 "Cannot instantiate empty bagweight-producing models!" +# @assert length(output_producing_models) == length(weight_producing_models) "Cannot instantiate bag with different numbers of output and weight producing models: $(length(output_producing_models)) != $(length(weight_producing_models))." +# output_producing_models = wrap.(output_producing_models) +# weight_producing_models = wrap.(weight_producing_models) +# O = Union{outcometype.(output_producing_models)...} +# MaxDecisionBag{O}(output_producing_models, weight_producing_models, args...; kwargs...) +# end +# end -isensemble(m::MaxDecisionBag) = true +# isensemble(m::MaxDecisionBag) = true -function apply(m::MaxDecisionBag, d::AbstractInterpretation; suppress_parity_warning = false, kwargs...) - weights = [apply(wm, d; suppress_parity_warning, kwargs...) for wm in m.weight_producing_models] - om = m.output_producing_models[argmax(weights)] - pred = apply(om, d; suppress_parity_warning, kwargs...) - # preds = [apply(om, d; suppress_parity_warning, kwargs...) for om in m.output_producing_models] - # pred = aggregation(m)(preds, weights; suppress_parity_warning) - pred -end +# function apply(m::MaxDecisionBag, d::AbstractInterpretation; suppress_parity_warning = false, kwargs...) +# weights = [apply(wm, d; suppress_parity_warning, kwargs...) for wm in m.weight_producing_models] +# om = m.output_producing_models[argmax(weights)] +# pred = apply(om, d; suppress_parity_warning, kwargs...) +# # preds = [apply(om, d; suppress_parity_warning, kwargs...) for om in m.output_producing_models] +# # pred = aggregation(m)(preds, weights; suppress_parity_warning) +# pred +# end -# TODO Add a keyword argument that toggles the soft or hard behavior. The hard behavior is one where you first find the bestguess among the weights, and then perform the apply only on the first +# # TODO Add a keyword argument that toggles the soft or hard behavior. The hard behavior is one where you first find the bestguess among the weights, and then perform the apply only on the first -# TODO parallelize -function apply( - m::MaxDecisionBag, - d::AbstractInterpretationSet; - suppress_parity_warning = false, - kwargs... -) - weights = hcat([apply(wm, d; suppress_parity_warning, kwargs...) for wm in m.weight_producing_models]...) - preds = __apply_post(m, preds) - preds = [ - apply(m.output_producing_models[im], d; suppress_parity_warning, kwargs...) - for im in argmax(weights; dims=2) - ] - preds = __apply_pre(m, d, preds) - return preds -end +# # TODO parallelize +# function apply( +# m::MaxDecisionBag, +# d::AbstractInterpretationSet; +# suppress_parity_warning = false, +# kwargs... 
+# ) +# weights = hcat([apply(wm, d; suppress_parity_warning, kwargs...) for wm in m.weight_producing_models]...) +# preds = __apply_post(m, preds) +# preds = [ +# apply(m.output_producing_models[im], d; suppress_parity_warning, kwargs...) +# for im in argmax(weights; dims=2) +# ] +# preds = __apply_pre(m, d, preds) +# return preds +# end -function apply!(m::MaxDecisionBag, d::AbstractInterpretationSet, y::AbstractVector; mode = :replace, leavesonly = false, suppress_parity_warning = false, kwargs...) - y = __apply_pre(m, d, y) - weights = hcat([apply!(wm, d, y; mode, leavesonly, suppress_parity_warning, kwargs...) for wm in m.weight_producing_models]...) - preds = __apply_post(m, preds) - preds = [ - apply!(m.output_producing_models[im], d, y; mode, leavesonly, suppress_parity_warning, kwargs...) - for im in argmax(weights; dims=2) - ] - preds = __apply_pre(m, d, preds) - return __apply!(m, mode, preds, y, leavesonly) -end +# function apply!(m::MaxDecisionBag, d::AbstractInterpretationSet, y::AbstractVector; mode = :replace, leavesonly = false, suppress_parity_warning = false, kwargs...) +# y = __apply_pre(m, d, y) +# weights = hcat([apply!(wm, d, y; mode, leavesonly, suppress_parity_warning, kwargs...) for wm in m.weight_producing_models]...) +# preds = __apply_post(m, preds) +# preds = [ +# apply!(m.output_producing_models[im], d, y; mode, leavesonly, suppress_parity_warning, kwargs...) +# for im in argmax(weights; dims=2) +# ] +# preds = __apply_pre(m, d, preds) +# return __apply!(m, mode, preds, y, leavesonly) +# end -""" -TODO explain. The output of XGBoost via the strategy "multi:softmax". -""" -const MaxTreeBag{O,W<:RLabel,A<:typeof(+),WW<:RLabel} = MaxDecisionBag{O,ConstantModel{O},DecisionEnsemble{W,DecisionTree,A,WW}} +# """ +# TODO explain. The output of XGBoost via the strategy "multi:softmax". +# """ +# const MaxTreeBag{O,W<:RLabel,A<:typeof(+),WW<:RLabel} = MaxDecisionBag{O,ConstantModel{O},DecisionEnsemble{W,DecisionTree,A,WW}} -function unique_with_indices(x) - unique_vals = unique(x) - indices = [findall(==(val), x) for val in unique_vals] - return unique_vals, indices -end +# function unique_with_indices(x) +# unique_vals = unique(x) +# indices = [findall(==(val), x) for val in unique_vals] +# return unique_vals, indices +# end # function apply!( # dbag::SoleModels.DecisionBag, @@ -398,3 +394,143 @@ end # dbag.info.supporting_predictions = top_prediction # end +# ---------------------------------------------------------------------------- # +# DecisionXGBoost struct # +# ---------------------------------------------------------------------------- # +""" +A `DecisionXGBoost` is an ensemble of models, weighted by leaf values, exp.summed during apply. + +See also [`DecisionForest`](@ref), [`DecisionTree`](@ref), [`DecisionEnsemble`](@ref), [`MaxDecisionBag`](@ref). +""" +struct DecisionXGBoost{O,T<:AbstractModel,A<:Base.Callable} <: AbstractDecisionEnsemble{O} + models::Vector{T} + aggregation::A + info::NamedTuple + + function DecisionXGBoost{O}( + models::AbstractVector{T}, + aggregation::Union{Nothing,Base.Callable}, + info::NamedTuple = (;); + return_sum::Bool=false + ) where {O,T<:AbstractModel} + @assert length(models) > 0 "Cannot instantiate empty ensemble!" + models = wrap.(models) + + if isnothing(aggregation) + aggregation = function(args...; return_sum=false) bestguess(args...; return_sum) end + end + + A = typeof(aggregation) + new{O,T,A}(collect(models), aggregation, info) + end + + function DecisionXGBoost{O}( + models::AbstractVector; + kwargs... 
+    ) where {O}
+        info = (;)
+        DecisionXGBoost{O}(models, nothing, info; kwargs...)
+    end
+
+    function DecisionXGBoost{O}(
+        models::AbstractVector,
+        info::NamedTuple;
+        kwargs...
+    ) where {O}
+        DecisionXGBoost{O}(models, nothing, info; kwargs...)
+    end
+
+    function DecisionXGBoost(
+        models::AbstractVector,
+        args...; kwargs...
+    )
+        @assert length(models) > 0 "Cannot instantiate empty ensemble!"
+        models = wrap.(models)
+        O = Union{outcometype.(models)...}
+        DecisionXGBoost{O}(models, args...; kwargs...)
+    end
+end
+
+isensemble(m::DecisionXGBoost) = true
+
+modelstype(m::DecisionXGBoost{O,T}) where {O,T} = T
+models(m::DecisionXGBoost) = m.models
+nmodels(m::DecisionXGBoost) = length(models(m))
+
+aggregation(m::DecisionXGBoost) = m.aggregation
+scored_aggregation(m::DecisionXGBoost) = aggregation(m)
+
+"""
+    function height(m::DecisionXGBoost)
+
+Return the maximum height across all the [`DecisionTree`](@ref)s within `m`.
+
+See also [`DecisionXGBoost`](@ref), [`DecisionForest`](@ref), [`DecisionTree`](@ref).
+"""
+height(m::DecisionXGBoost) = subtreeheight(m)
+
+immediatesubmodels(m::DecisionXGBoost) = trees(m)
+nimmediatesubmodels(m::DecisionXGBoost) = length(trees(m))
+listimmediaterules(m::DecisionXGBoost; kwargs...) = error("TODO implement")
+
+# ---------------------------------------------------------------------------- #
+#                             DecisionXGBoost apply                             #
+# ---------------------------------------------------------------------------- #
+function apply(
+    m::DecisionXGBoost,
+    d::AbstractInterpretation;
+    suppress_parity_warning=false,
+    kwargs...
+)
+    preds = [apply_leaf_scores(subm, d; suppress_parity_warning, kwargs...) for subm in models(m)]
+    preds = __apply_post(m, preds)
+    scored_aggregation(m)(preds, sort(unique(m.info.supporting_labels)); suppress_parity_warning)
+end
+
+# TODO parallelize
+function apply(
+    m::DecisionXGBoost,
+    d::AbstractInterpretationSet;
+    suppress_parity_warning=false,
+    kwargs...
+)
+    # We expect nrounds × nclasses trees: at every boosting round, XGBoost
+    # creates one tree per class label. So, in every subm model, we'll find
+    # as many trees as class labels.
+    preds = hcat([apply_leaf_scores(subm, d; suppress_parity_warning, kwargs...) for subm in models(m)]...)
+    preds = __apply_post(m, preds)
+    preds = [
+        scored_aggregation(m)(pred, sort(unique(m.info.supporting_labels)))
+        for pred in eachrow(preds)
+    ]
+    return preds
+end
+
+# TODO parallelize
+# function apply!(
+#     m::DecisionXGBoost,
+#     d::AbstractInterpretationSet,
+#     y::AbstractVector;
+#     mode = :replace,
+#     leavesonly = false,
+#     # show_progress = false, # length(ntrees(m)) > 15,
+#     suppress_parity_warning = false,
+#     kwargs...
+# )
+#     # @show y
+#     y = __apply_pre(m, d, y)
+#     # _d = SupportedLogiset(d) TODO?
+#     # @show y
+#     preds = hcat([apply!(subm, d, y; mode, leavesonly, kwargs...) for subm in models(m)]...)
+
+#     preds = __apply_post(m, preds)
+
+#     preds = [
+#         weighted_aggregation(m)(preds[i,:]; suppress_parity_warning, kwargs...)
+# for i in 1:size(preds,1) +# ] + +# preds = __apply_pre(m, d, preds) +# return __apply!(m, mode, preds, y, leavesonly) +# end + diff --git a/src/utils/models/leaf.jl b/src/utils/models/leaf.jl index 599f91a..d49101d 100644 --- a/src/utils/models/leaf.jl +++ b/src/utils/models/leaf.jl @@ -95,6 +95,43 @@ end convert(::Type{ConstantModel{O}}, o::O) where {O} = ConstantModel{O}(o) convert(::Type{<:AbstractModel{F}}, m::ConstantModel) where {F} = ConstantModel{F}(m) +# ---------------------------------------------------------------------------- # +# DecisionXGBoost apply # +# ---------------------------------------------------------------------------- # +outcome_leaf_value(m::ConstantModel) = m.info.leaf_values + +apply_leaf_scores(m::ConstantModel, i::AbstractInterpretation; kwargs...) = outcome(m) +apply_leaf_scores( + m::ConstantModel, + d::AbstractInterpretationSet, + i_instance::Integer; + kwargs... +) = (outcome(m), outcome_leaf_value(m)) +apply_leaf_scores( + m::ConstantModel, + d::AbstractInterpretationSet; + kwargs... +) = Fill((outcome(m), outcome_leaf_value(m)), ninstances(d)) + +function apply_leaf_scores!( + m::ConstantModel, + d::AbstractInterpretationSet, + y::AbstractVector; + mode = :replace, + leavesonly = false, + kwargs... +) + # @assert length(y) == ninstances(d) "$(length(y)) == $(ninstances(d))" + if mode == :replace + recursivelyemptysupports!(m, leavesonly) + mode = :append + end + + preds = fill((outcome(m), outcome_leaf_value(m)), ninstances(d)) + + return __apply!(m, mode, preds, y, leavesonly) +end + ############################################################################################ ################################### FunctionModel ########################################## ############################################################################################ diff --git a/src/utils/models/rule-and-branch.jl b/src/utils/models/rule-and-branch.jl index b845e31..e45df00 100644 --- a/src/utils/models/rule-and-branch.jl +++ b/src/utils/models/rule-and-branch.jl @@ -347,6 +347,35 @@ function apply( preds end +# ---------------------------------------------------------------------------- # +# DecisionXGBoost apply # +# ---------------------------------------------------------------------------- # +function apply_leaf_scores( + m::Branch, + d::AbstractInterpretationSet; + check_args::Tuple = (), + check_kwargs::NamedTuple = (;), + kwargs... +) + checkmask = checkantecedent(m, d, check_args...; check_kwargs...) + preds = Vector(undef,length(checkmask)) + preds[checkmask] .= apply_leaf_scores( + posconsequent(m), + slicedataset(d, checkmask; return_view = true, allow_no_instances = true); + check_args = check_args, + check_kwargs = check_kwargs, + kwargs... + ) + preds[(!).(checkmask)] .= apply_leaf_scores( + negconsequent(m), + slicedataset(d, (!).(checkmask); return_view = true, allow_no_instances = true); + check_args = check_args, + check_kwargs = check_kwargs, + kwargs... 
+ ) + preds +end + function apply!( m::Branch, d::AbstractInterpretationSet, diff --git a/test/XgBoostExt/xgboost.jl b/test/XgBoostExt/xgboost.jl index b8d9a24..de05f1d 100644 --- a/test/XgBoostExt/xgboost.jl +++ b/test/XgBoostExt/xgboost.jl @@ -7,6 +7,7 @@ using DataFrames using MLJXGBoostInterface using SoleModels +import MLJModelInterface as MMI import XGBoost as XGB using Random, CategoricalArrays @@ -29,7 +30,7 @@ println("Test set type: ", typeof(X_test), " - ", typeof(y_test)) XGTrees = MLJ.@load XGBoostClassifier pkg=XGBoost model = XGTrees(; - num_round=10, + num_round=1, max_depth=6, objective="multi:softmax" ) @@ -41,28 +42,31 @@ fit!(mach) trees = XGB.trees(mach.fitresult[1]) -featurenames = mach.report.vals[1][1] -ds_safetest = vcat(y, "nothing") - +get_encoding(classes_seen) = Dict(MMI.int(c) => c for c in MMI.classes(classes_seen)) +get_classlabels(encoding) = [string(encoding[i]) for i in sort(keys(encoding) |> collect)] +encoding = get_encoding(mach.fitresult[2]) +classlabels = get_classlabels(encoding) +featurenames = mach.report.vals[1].features +# ds_safetest = vcat(y_train, "nothing") -solem = solemodel(trees, Matrix(X), ds_safetest) -solem = solemodel(trees, Matrix(X), ds_safetest; featurenames) -solem = solemodel(trees, Matrix(X), ds_safetest; featurenames, keep_condensed = false) +# solem = solemodel(trees, Matrix(X_train), y_train) +solem = solemodel(trees, Matrix(X_train), y_train; classlabels, featurenames) +solem = solemodel(trees, Matrix(X_train), y_train; classlabels, featurenames, keep_condensed = false) @test SoleData.scalarlogiset(X_test; allow_propositional = true) isa PropositionalLogiset # Make test instances flow into the model preds = apply(solem, X_test) -preds2 = apply!(solem, X_test, y_test) +# preds2 = apply!(solem, X_test, y_test) -@test preds == preds2 +# @test preds == preds2 accuracy = sum(preds .== y_test)/length(y_test) -@test accuracy > 0.7 +@test accuracy > 0.9 # apply!(solem, X_test, y_test, mode = :append) -solem = @test_throws ErrorException solemodel(trees, Matrix(X), ds_safetest; featurenames, keep_condensed = true) -solem = @test_nowarn solemodel(trees, Matrix(X), ds_safetest; featurenames, keep_condensed = false) +solem = @test_throws ErrorException solemodel(trees, Matrix(X_train), y_train; classlabels, keep_condensed = true) +solem = @test_nowarn solemodel(trees, Matrix(X_train), y_train; classlabels, keep_condensed = false) printmodel(solem; max_depth = 7, show_intermediate_finals = true, show_metrics = true) @@ -80,3 +84,43 @@ bst = XGB.xgboost( # obtain model predictions ŷ = XGB.predict(bst, X_test) +predsl = CategoricalArrays.levelcode.(categorical(preds)) .- 1 +@test predsl == ŷ + +outperform = 0 +underperform = 0 +i = 0 + +for seed in 1:40 + rng = Xoshiro(seed) + train, test = partition(eachindex(y), train_ratio; shuffle=true, rng) + X_train, y_train = X[train, :], y[train] + X_test, y_test = X[test, :], y[test] + for num_round in 10:10:100 + for eta in 0.1:0.1:0.9 + model = XGTrees(; num_round, eta, objective="multi:softmax") + mach = machine(model, X_train, y_train) + fit!(mach) + trees = XGB.trees(mach.fitresult[1]) + solem = solemodel(trees, Matrix(X_train), y_train; classlabels, featurenames) + preds = apply(solem, X_test) + predsl = CategoricalArrays.levelcode.(categorical(preds)) + + yl_train = CategoricalArrays.levelcode.(categorical(y_train)) .- 1 + bst = XGB.xgboost((X_train, yl_train); num_round, eta, num_class=3, objective="multi:softmax") + ŷ = XGB.predict(bst, X_test) + + sole_accuracy = sum(predsl .== 
CategoricalArrays.levelcode.(categorical(y_test)))/length(y_test)
+            xgb_accuracy = sum(ŷ .== CategoricalArrays.levelcode.(categorical(y_test)) .- 1)/length(y_test)
+
+            sole_accuracy > xgb_accuracy && global outperform += 1
+            sole_accuracy < xgb_accuracy && global underperform += 1
+            i += 1
+        end
+    end
+end
+
+@test outperform > underperform
+println("SoleModel outperformed XGBoost in $outperform out of $i tests.")
+println("SoleModel underperformed XGBoost in $underperform out of $i tests.")
+

From 2ad644250ac864757f73c250b25fbc616bce5456 Mon Sep 17 00:00:00 2001
From: PasoStudio73
Date: Sat, 12 Apr 2025 00:23:10 +0200
Subject: [PATCH 11/44] xgboost float predict issue

---
 test/XgBoostExt/xgboost_predict_issue.jl | 446 +++++++++++++++++++++++
 1 file changed, 446 insertions(+)
 create mode 100644 test/XgBoostExt/xgboost_predict_issue.jl

diff --git a/test/XgBoostExt/xgboost_predict_issue.jl b/test/XgBoostExt/xgboost_predict_issue.jl
new file mode 100644
index 0000000..a91a66e
--- /dev/null
+++ b/test/XgBoostExt/xgboost_predict_issue.jl
@@ -0,0 +1,446 @@
+using MLJ
+using DataFrames
+using MLJXGBoostInterface
+import MLJModelInterface as MMI
+using SoleModels
+import XGBoost as XGB
+using CategoricalArrays
+using Random
+
+function predict_xgboost_bag(trees, X; n_classes=0, objective="binary:logistic")
+    n_samples = size(X, 1)
+    ntree_limit = length(trees)
+    n_classes == 0 && throw(ArgumentError("n_classes must be specified for multi-class predictions"))
+
+    # Initialize predictions
+    if startswith(objective, "multi:softprob") || startswith(objective, "multi:softmax")
+        # For multi-class probabilities, we need a matrix
+        raw_preds = zeros(Float64, n_samples, n_classes)
+    else
+        # For binary and regression, a vector is sufficient
+        raw_preds = zeros(Float64, n_samples)
+    end
+
+    # Iterate through trees and accumulate predictions
+    for i in 1:ntree_limit
+        tree = trees[i]
+        tree_preds = predict_tree(tree, X)
+        @show tree_preds
+        if startswith(objective, "multi:softprob") || startswith(objective, "multi:softmax")
+            # For multi-class softprob, each tree outputs predictions for a specific class
+            class_idx = (i - 1) % n_classes + 1
+            raw_preds[:, class_idx] .+= tree_preds
+            @show class_idx
+            @show raw_preds
+        else
+            # For binary or regression, simply add the predictions
+            raw_preds .+= tree_preds
+        end
+    end
+    # Apply appropriate transformation based on objective
+    if objective == "binary:logistic"
+        # Apply sigmoid transformation
+        return 1.0 ./ (1.0 .+ exp.(-raw_preds))
+    elseif objective == "multi:softprob"
+        # Apply softmax transformation
+        exp_preds = exp.(raw_preds)
+        row_sums = sum(exp_preds, dims=2)
+        @show exp_preds
+        @show row_sums
+        @show exp_preds ./ row_sums
+        return exp_preds ./ row_sums
+    elseif objective == "multi:softmax"
+        # Return class with highest score
+        if n_classes > 1
+            _, indices = findmax(raw_preds, dims=2)
+            return [idx[2] for idx in indices]
+        else
+            return raw_preds .> 0
+        end
+    elseif objective == "count:poisson"
+        # Apply exponential transformation for Poisson
+        return exp.(raw_preds)
+    else
+        # For regression or other objectives, return raw predictions
+        return raw_preds
+    end
+end
+
+function predict_tree(tree, X)
+    n_samples = size(X, 1)
+    predictions = zeros(Float64, n_samples)
+
+    for i in 1:n_samples
+        predictions[i] = traverse_tree(tree, X[i, :])
+    end
+    return predictions
+end
+
+function traverse_tree(tree, x)
+    # Start at root node
+    node = tree # Adjust based on your tree structure
+
+    # Traverse until reaching a leaf
+    while !isempty(node.children)
+        # Get the split feature and value
+        feature_idx = node.split
+        split_value = node.split_condition
+
+        # Decide which child to go to
+        if x[feature_idx] < split_value
+            node = node.children[1]
+        else
+            node = node.children[2]
+        end
+    end
+    # Return the leaf value
+    return node.leaf
+end
+
+X, y = @load_iris
+X = DataFrame(X)
+train_ratio = 0.8
+seed, num_round, eta = 3, 1, 0.1
+rng = Xoshiro(seed)
+train, test = partition(eachindex(y), train_ratio; shuffle=true, rng)
+X_train, y_train = X[train, :], y[train]
+X_test, y_test = X[test, :], y[test]
+
+XGTrees = MLJ.@load XGBoostClassifier pkg=XGBoost
+model = XGTrees(; num_round, eta, objective="multi:softprob")
+mach = machine(model, X_train, y_train)
+fit!(mach)
+# mlj_predict = predict(mach, DataFrame(X_test[27,:])) # WORKING
+mlj_predict = predict(mach, DataFrame(X_test[28,:])) # NOT WORKING
+trees = XGB.trees(mach.fitresult[1])
+get_encoding(classes_seen) = Dict(MMI.int(c) => c for c in MMI.classes(classes_seen))
+get_classlabels(encoding) = [string(encoding[i]) for i in sort(keys(encoding) |> collect)]
+encoding = get_encoding(mach.fitresult[2])
+classlabels = get_classlabels(encoding)
+@show classlabels
+featurenames = mach.report.vals[1].features
+solem = solemodel(trees, Matrix(X_train), y_train; classlabels, featurenames)
+# preds = apply(solem, DataFrame(X_test[27,:])) # WORKING
+preds = apply(solem, DataFrame(X_test[28,:])) # NOT WORKING
+predsl = CategoricalArrays.levelcode.(categorical(preds)) .- 1
+
+yl_train = CategoricalArrays.levelcode.(categorical(y_train)) .- 1
+bst = XGB.xgboost((X_train, yl_train); num_round, eta, num_class=3, objective="multi:softprob")
+xtrs = XGB.trees(bst)
+# yyy = XGB.predict(bst, DataFrame(X_test[27,:])) # WORKING
+yyy = XGB.predict(bst, DataFrame(X_test[28,:])) # NOT WORKING
+
+
+# # For multi-class classification
+rename!(X_test, [:f0, :f1, :f2, :f3])
+# class_probs = predict_xgboost_bag(trees, DataFrame(X_test[27,:]); n_classes=3, objective="multi:softprob") # WORKING
+class_probs = predict_xgboost_bag(trees, DataFrame(X_test[28,:]); n_classes=3, objective="multi:softprob") # NOT WORKING
+class_preds = [argmax(probs) for probs in eachrow(class_probs)] .-1
+
+isapprox(Float32.(class_probs), yyy, atol=1e-5)
+
+# # For regression
+# reg_preds = predict_xgboost_bag(mtrs, X_test, objective="reg:squarederror")
+
+# num_round = 20
+# eta = 0.3
+# yl_train = CategoricalArrays.levelcode.(categorical(y_train)) .- 1
+# bst = XGB.xgboost((X_train, yl_train); num_round, eta, num_class=3, objective="multi:softmax")
+# ŷ = XGB.predict(bst, X_test)
+
+### TREE 1
+"""
+xtrs[1].cover = 53.3333282
+xtrs[1].gain = 55.7546806
+xtrs[1].nmissing = 2
+xtrs[1].yes = 1
+xtrs[1].no = 2
+xtrs[1].split = "petal_length"
+xtrs[1].split_condition = 3.0
+
+xtrs[1].children[1].cover = 16.8888874
+xtrs[1].children[1].id = 1
+xtrs[1].children[1].leaf = 0.141614899
+
+xtrs[1].children[2].cover = 36.4444427
+xtrs[1].children[2].id = 2
+xtrs[1].children[2].leaf = -0.072997041
+
+solem.models[1].info =
+(leaf_values = [0.141614899, -0.072997041],
+ supporting_predictions = CategoricalValue{String, UInt32}["setosa", "setosa", "setosa", "virginica", "virginica", "virginica"],
+ supporting_labels = ["setosa", "virginica", "versicolor", "setosa", "virginica", "versicolor"],)
+
+ solem.models[1].antecedent =
+Atom{ScalarCondition{Float64, VariableValue{Int64, Symbol}, ScalarMetaCondition{VariableValue{Int64, Symbol}, typeof(<)}}}: [petal_length] < 3.0
+
+solem.models[1].posconsequent.outcome = CategoricalValue{String, UInt32} "setosa"
+solem.models[1].posconsequent.info = +(leaf_values = 0.141614899, + supporting_predictions = CategoricalValue{String, UInt32}["setosa", "setosa", "setosa"], + supporting_labels = ["setosa", "virginica", "versicolor"],) + +solem.models[1].negconsequent.outcome = CategoricalValue{String, UInt32} "virginica" + solem.models[1].negconsequent.info = +(leaf_values = -0.072997041, + supporting_predictions = CategoricalValue{String, UInt32}["virginica", "virginica", "virginica"], + supporting_labels = ["setosa", "virginica", "versicolor"],) +""" + +### TREE 2 +""" +xtrs[2].cover = 53.3333282 +xtrs[2].gain = 11.9339008 +xtrs[2].nmissing = 2 +xtrs[2].yes = 1 +xtrs[2].no = 2 +xtrs[2].split = "petal_length" +xtrs[2].split_condition = 3.0 + + xtrs[2]children[1].cover = 16.8888874 + xtrs[2]children[1].id = 1 + xtrs[2]children[1].leaf = -0.070807457 + + xtrs[2].children[2].cover = 36.4444427 + xtrs[2].children[2].gain = 35.383049 + xtrs[2].children[2].nmissing = 4 + xtrs[2].children[2].yes = 3 + xtrs[2].children[2].no = 4 + xtrs[2].children[2].split = "petal_length" + xtrs[2].children[2].split_condition = 4.9000001 + + xtrs[2].children[2].children[1].cover = 17.7777767 + xtrs[2].children[2].children[1].gain = 4.09395218 + xtrs[2].children[2].children[1].nmissing = 6 + xtrs[2].children[2].children[1].yes = 5 + xtrs[2].children[2].children[1].no = 6 + xtrs[2].children[2].children[1].split = "petal_width" + xtrs[2].children[2].children[1].split_condition = 1.70000005 + + xtrs[2].children[2].children[1].children[1].cover = 15.999999 + xtrs[2].children[2].children[1].children[1].id = 5 + xtrs[2].children[2].children[1].children[1].leaf = 0.141176477 + + xtrs[2].children[2].children[1].children[2].cover = 1.77777767 + xtrs[2].children[2].children[1].children[2].id = 6 + xtrs[2].children[2].children[1].children[2].leaf = -0.0120000029 + + xtrs[2].children[2].children[2].cover = 18.666666 + xtrs[2].children[2].children[2].gain = 0.264455795 + xtrs[2].children[2].children[2].nmissing = 8 + xtrs[2].children[2].children[2].yes = 7 + xtrs[2].children[2].children[2].no = 8 + xtrs[2].children[2].children[2].split = "petal_width" + xtrs[2].children[2].children[2].split_condition = 1.70000005 + + xtrs[2].children[2].children[2].children[1].cover = 2.22222209 + xtrs[2].children[2].children[2].children[1].id = 7 + xtrs[2].children[2].children[2].children[1].leaf = -0.0206896588 + + xtrs[2].children[2].children[2].children[2].cover = 16.4444427 + xtrs[2].children[2].children[2].children[2].id = 8 + xtrs[2].children[2].children[2].children[2].leaf = -0.0707006454 + +solem.models[2].info = +(leaf_values = [-0.070807457, 0.141176477, -0.0120000029, -0.0206896588, -0.0707006454], + supporting_predictions = CategoricalValue{String, UInt32}["setosa", "setosa", "setosa", "versicolor", "versicolor", "versicolor", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica"], + supporting_labels = ["setosa", "virginica", "versicolor", "setosa", "virginica", "versicolor", "setosa", "virginica", "versicolor", "setosa", "virginica", "versicolor", "setosa", "virginica", "versicolor"],) +solem.models[2].antecedent = +Atom{ScalarCondition{Float64, VariableValue{Int64, Symbol}, ScalarMetaCondition{VariableValue{Int64, Symbol}, typeof(<)}}}: [petal_length] < 3.0 + + solem.models[2].posconsequent.outcome = CategoricalValue{String, UInt32} "setosa" + solem.models[2].posconsequent.info = + (leaf_values = -0.070807457, + supporting_predictions = CategoricalValue{String, UInt32}["setosa", 
"setosa", "setosa"], + supporting_labels = ["setosa", "virginica", "versicolor"],) + + solem.models[2].negconsequent.antecedent = + Atom{ScalarCondition{Float64, VariableValue{Int64, Symbol}, ScalarMetaCondition{VariableValue{Int64, Symbol}, typeof(<)}}}: [petal_length] < 4.9000001 + solem.models[2].negconsequent.info = + (leaf_values = [0.141176477, -0.0120000029, -0.0206896588, -0.0707006454], + supporting_predictions = CategoricalValue{String, UInt32}["versicolor", "versicolor", "versicolor", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica"], + supporting_labels = ["setosa", "virginica", "versicolor", "setosa", "virginica", "versicolor", "setosa", "virginica", "versicolor", "setosa", "virginica", "versicolor"],) + + solem.models[2].negconsequent.posconsequent.antecedent = + Atom{ScalarCondition{Float64, VariableValue{Int64, Symbol}, ScalarMetaCondition{VariableValue{Int64, Symbol}, typeof(<)}}}: [petal_width] < 1.70000005 + solem.models[2].negconsequent.posconsequent.info = + (leaf_values = [0.141176477, -0.0120000029], + supporting_predictions = CategoricalValue{String, UInt32}["versicolor", "versicolor", "versicolor", "virginica", "virginica", "virginica"], + supporting_labels = ["setosa", "virginica", "versicolor", "setosa", "virginica", "versicolor"],) + + solem.models[2].negconsequent.posconsequent.posconsequent.outcome = CategoricalValue{String, UInt32} "versicolor" + solem.models[2].negconsequent.posconsequent.posconsequent.info = + (leaf_values = 0.141176477, + supporting_predictions = CategoricalValue{String, UInt32}["versicolor", "versicolor", "versicolor"], + supporting_labels = ["setosa", "virginica", "versicolor"],) + + solem.models[2].negconsequent.posconsequent.negconsequent.outcome = CategoricalValue{String, UInt32} "virginica" + solem.models[2].negconsequent.posconsequent.negconsequent.info = + (leaf_values = -0.0120000029, + supporting_predictions = CategoricalValue{String, UInt32}["virginica", "virginica", "virginica"], + supporting_labels = ["setosa", "virginica", "versicolor"],) + + solem.models[2].negconsequent.negconsequent.antecedent = + Atom{ScalarCondition{Float64, VariableValue{Int64, Symbol}, ScalarMetaCondition{VariableValue{Int64, Symbol}, typeof(<)}}}: [petal_width] < 1.70000005 + solem.models[2].negconsequent.negconsequent.info = + (leaf_values = [-0.0206896588, -0.0707006454], + supporting_predictions = CategoricalValue{String, UInt32}["virginica", "virginica", "virginica", "virginica", "virginica", "virginica"], + supporting_labels = ["setosa", "virginica", "versicolor", "setosa", "virginica", "versicolor"],) + + solem.models[2].negconsequent.negconsequent.posconsequent.outcome = CategoricalValue{String, UInt32} "virginica" + solem.models[2].negconsequent.negconsequent.posconsequent.info = + (leaf_values = -0.0206896588, + supporting_predictions = CategoricalValue{String, UInt32}["virginica", "virginica", "virginica"], + supporting_labels = ["setosa", "virginica", "versicolor"],) + + solem.models[2].negconsequent.negconsequent.negconsequent.outcome = CategoricalValue{String, UInt32} "virginica" + solem.models[2].negconsequent.negconsequent.negconsequent.info = + (leaf_values = -0.0707006454, + supporting_predictions = CategoricalValue{String, UInt32}["virginica", "virginica", "virginica"], + supporting_labels = ["setosa", "virginica", "versicolor"],) +""" + +### TREE 3 +""" +xtrs[3].cover = 53.3333282 +xtrs[3].gain = 51.9276886 +xtrs[3].nmissing = 2 +xtrs[3].yes = 1 +xtrs[3].no = 2 
+xtrs[3].split = "petal_length" +xtrs[3].split_condition = 4.80000019 + + xtrs[3].children[1].cover = 32.8888855 + xtrs[3].children[1].gain = 0.676908493 + xtrs[3].children[1].nmissing = 4 + xtrs[3].children[1].yes = 3 + xtrs[3].children[1].no = 4 + xtrs[3].children[1].split = "petal_width" + xtrs[3].children[1].split_condition = 1.60000002 + + xtrs[3].children[1].children[1].cover = 31.5555534 + xtrs[3].children[1].children[1].id = 3 + xtrs[3].children[1].children[1].leaf = -0.0726962537 + + xtrs[3].children[1].children[2].cover = 1.33333325 + xtrs[3].children[1].children[2].id = 4 + xtrs[3].children[1].children[2].leaf = -2.55448485e-9 + + xtrs[3].children[2].cover = 20.4444427 + xtrs[3].children[2].gain = 1.53349686 + xtrs[3].children[2].nmissing = 6 + xtrs[3].children[2].yes = 5 + xtrs[3].children[2].no = 6 + xtrs[3].children[2].split = "petal_length" + xtrs[3].children[2].split_condition = 4.9000001 + + xtrs[3].children[2].children[1].cover = 1.77777767 + xtrs[3].children[2].children[1].id = 5 + xtrs[3].children[2].children[1].leaf = 0.0239999983 + + xtrs[3].children[2].children[2].cover = 18.666666 + xtrs[3].children[2].children[2].id = 6 + xtrs[3].children[2].children[2].leaf = 0.137288138 + +solem.models[3].info = +(leaf_values = [-0.0726962537, -2.55448485e-9, 0.0239999983, 0.137288138], + supporting_predictions = CategoricalValue{String, UInt32}["setosa", "setosa", "setosa", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica"], + supporting_labels = ["setosa", "virginica", "versicolor", "setosa", "virginica", "versicolor", "setosa", "virginica", "versicolor", "setosa", "virginica", "versicolor"],) +solem.models[3].antecedent = Atom{ScalarCondition{Float64, VariableValue{Int64, Symbol}, ScalarMetaCondition{VariableValue{Int64, Symbol}, typeof(<)}}}: [petal_length] < 4.80000019 + + solem.models[3].posconsequent.info = + (leaf_values = [-0.0726962537, -2.55448485e-9], + supporting_predictions = CategoricalValue{String, UInt32}["setosa", "setosa", "setosa", "virginica", "virginica", "virginica"], + supporting_labels = ["setosa", "virginica", "versicolor", "setosa", "virginica", "versicolor"],) + solem.models[3].posconsequent.antecedent = Atom{ScalarCondition{Float64, VariableValue{Int64, Symbol}, ScalarMetaCondition{VariableValue{Int64, Symbol}, typeof(<)}}}: [petal_width] < 1.60000002 + + solem.models[3].posconsequent.posconsequent.info + (leaf_values = -0.0726962537, + supporting_predictions = CategoricalValue{String, UInt32}["setosa", "setosa", "setosa"], + supporting_labels = ["setosa", "virginica", "versicolor"],) + solem.models[3].posconsequent.posconsequent.outcome = CategoricalValue{String, UInt32} "setosa" + + solem.models[3].posconsequent.negconsequent.info = + (leaf_values = -2.55448485e-9, + supporting_predictions = CategoricalValue{String, UInt32}["virginica", "virginica", "virginica"], + supporting_labels = ["setosa", "virginica", "versicolor"],) + solem.models[3].posconsequent.negconsequent.outcome = CategoricalValue{String, UInt32} "virginica" + + solem.models[3].negconsequent.info = + (leaf_values = [0.0239999983, 0.137288138], + supporting_predictions = CategoricalValue{String, UInt32}["virginica", "virginica", "virginica", "virginica", "virginica", "virginica"], + supporting_labels = ["setosa", "virginica", "versicolor", "setosa", "virginica", "versicolor"],) + solem.models[3].negconsequent.antecedent = Atom{ScalarCondition{Float64, VariableValue{Int64, Symbol}, ScalarMetaCondition{VariableValue{Int64, 
Symbol}, typeof(<)}}}: [petal_length] < 4.9000001
+
+    solem.models[3].negconsequent.posconsequent.info =
+    (leaf_values = 0.0239999983,
+    supporting_predictions = CategoricalValue{String, UInt32}["virginica", "virginica", "virginica"],
+    supporting_labels = ["setosa", "virginica", "versicolor"],)
+    solem.models[3].negconsequent.posconsequent.outcome = CategoricalValue{String, UInt32} "virginica"
+
+    solem.models[3].negconsequent.negconsequent.info =
+    (leaf_values = 0.137288138,
+    supporting_predictions = CategoricalValue{String, UInt32}["virginica", "virginica", "virginica"],
+    supporting_labels = ["setosa", "virginica", "versicolor"],)
+    solem.models[3].negconsequent.negconsequent.outcome = CategoricalValue{String, UInt32} "virginica"
+"""
+
+# calculating the probabilities
+
+# Row │ sepal_length  sepal_width  petal_length  petal_width
+#     │ Float64       Float64      Float64       Float64
+# ─────┼──────────────────────────────────────────────────────
+#   1 │ 6.9           3.1          4.9           1.5
+
+### TREE 1: probability of setosa
+"""
+"petal_length" < 3.0 -- no >> leaf = -0.072997041
+"""
+
+### TREE 2: probability of versicolor
+"""
+"petal_length" < 3.0 -- no > "petal_length" < 4.9000001 -- yes > "petal_width" < 1.70000005 -- yes >> leaf = 0.141176477
+"""
+
+### TREE 3: probability of virginica
+"""
+"petal_length" < 4.80000019 -- no > "petal_length" < 4.9000001 -- yes >> leaf = 0.0239999983
+"""
+
+### calculating multi:softprob
+"""
+exp_preds = exp.(-0.072997041 0.141176477 0.0239999983) = 0.929604 1.15163 1.02429
+row_sums = sum(exp_preds, dims=2) = 3.1055217627515077
+probability = exp_preds / row_sums = 0.299339 0.370832 0.329829
+
+XGBoost probability: 0.304161 0.320495 0.375344
+"""
+
+"""
+### reasoning by contradiction: a rounding problem ###
+"petal_length" = 4.9
+"petal_length" < 4.9000001 is evaluated as false
+
+so:
+# tree 2
+"petal_length" < 3.0 -- no > "petal_length" < 4.9000001 -- no > "petal_width" < 1.70000005 -- yes >> leaf = -0.0206896588
+# tree 3
+"petal_length" < 4.80000019 -- no > "petal_length" < 4.9000001 -- no >> leaf = 0.137288138
+"""
+exp_preds = exp.([-0.072997041, -0.0206896588, 0.137288138])
+row_sums = sum(exp_preds)
+probability = exp_preds ./ row_sums
+
+"""
+# 3-element Vector{Float64}:
+# 0.3041612750760762
+# 0.320494608175597
+# 0.3753441167483268
+
+# XGBoost probability: 0.304161 0.320495 0.375344
+
+PROBLEM SOLVED
+if we evaluate
+4.9 < 4.9000001 as false
+then we get the same result as XGBoost's predict
+"""
\ No newline at end of file
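The rounding claim above can be checked directly at the Julia REPL. This is a minimal sanity check using only Base Julia, not part of the patch series; 4.9000001 is simply the shortest decimal that round-trips to the underlying Float32 split threshold:

    julia> 4.9 < 4.9000001                      # Float64 comparison: the "yes" branch is taken
    true

    julia> Float32(4.9) == Float32(4.9000001)   # both decimals round to the same Float32 value
    true

    julia> Float32(4.9) < Float32(4.9000001)    # the comparison XGBoost actually performs: "no" branch
    false

This is what the patches below address: casting both feature values and split conditions to Float32 before comparing makes the reconstructed model take the same branches as XGBoost's own predict.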
From dd2350190d8e86b40321f33b24ae115d9ba5802d Mon Sep 17 00:00:00 2001
From: PasoStudio73
Date: Wed, 14 Apr 2025 15:23:34 +0200
Subject: [PATCH 12/44] xgboost predict found areas requiring fixes

---
 ext/XGBoostExt.jl                        | 16 ++++++++++--
 test/XgBoostExt/xgboost.jl               | 25 +++++++++++++++-
 test/XgBoostExt/xgboost_predict_issue.jl | 36 ++++++++++++++++------
 3 files changed, 65 insertions(+), 12 deletions(-)

diff --git a/ext/XGBoostExt.jl b/ext/XGBoostExt.jl
index 78d7031..22cf984 100644
--- a/ext/XGBoostExt.jl
+++ b/ext/XGBoostExt.jl
@@ -115,7 +115,9 @@ end
 function early_return(leaf, antecedent, clabel, classl)
     info =(;
-        leaf_values = leaf,
+        # leaf_values = leaf,
+        ### debug convert to Float32 TODO delete
+        leaf_values = Float32(leaf),
         supporting_predictions = clabel,
         supporting_labels = [classl],
     )
@@ -150,6 +152,8 @@ function SoleModels.solemodel(
         # xgboost trees could be composed of only one leaf, without any split
         if isnothing(t.split)
             antecedent = Atom(get_condition(class_idx, featurenames; test_operator=(<), featval=Inf))
+            ### debug different test_operator TODO delete
+            # antecedent = Atom(get_condition(class_idx, featurenames; test_operator=(≤), featval=Inf))
             early_return(t.leaf, antecedent, clabels, classlabels[class_idx])
         else
             SoleModels.solemodel(t, X, y; classlabels, featurenames, class_idx, clabels, kwargs...)
@@ -186,14 +190,20 @@ function SoleModels.solemodel(
     clabels
 )
     antecedent = Atom(get_condition(tree.split, tree.split_condition, featurenames; test_operator=(<)))
+    ### debug different test_operator TODO delete
+    # antecedent = Atom(get_condition(tree.split, tree.split_condition, featurenames; test_operator=(≤)))
 
     # create a new path for the left branch
     left_path = copy(path_conditions)
     push!(left_path, Atom(get_condition(tree.split, tree.split_condition, featurenames; test_operator=(<))))
+    ### debug different test_operator TODO delete
+    # push!(left_path, Atom(get_condition(tree.split, tree.split_condition, featurenames; test_operator=(≤))))
 
     # create a new path for the right branch
     right_path = copy(path_conditions)
     push!(right_path, Atom(get_condition(tree.split, tree.split_condition, featurenames; test_operator=(≥))))
+    ### debug different test_operator TODO delete
+    # push!(right_path, Atom(get_condition(tree.split, tree.split_condition, featurenames; test_operator=(>))))
 
     lefttree = if isnothing(tree.children[1].split)
         # @show SoleModels.join_antecedents(left_path)
@@ -238,7 +248,9 @@ function xgbleaf(
     isnothing(prediction) && return nothing
 
     info = (;
-        leaf_values = leaf.leaf,
+        # leaf_values = leaf.leaf,
+        ### debug convert to Float32 TODO delete
+        leaf_values = Float32(leaf.leaf),
         supporting_predictions = fill(prediction, length(labels)),
         supporting_labels = labels,
     )
diff --git a/test/XgBoostExt/xgboost.jl b/test/XgBoostExt/xgboost.jl
index de05f1d..9a1ef99 100644
--- a/test/XgBoostExt/xgboost.jl
+++ b/test/XgBoostExt/xgboost.jl
@@ -90,6 +90,7 @@ predsl = CategoricalArrays.levelcode.(categorical(preds)) .- 1
 outperform = 0
 underperform = 0
 i = 0
+j = 0
 
 for seed in 1:40
     rng = Xoshiro(seed)
@@ -110,17 +111,39 @@ for seed in 1:40
             bst = XGB.xgboost((X_train, yl_train); num_round, eta, num_class=3, objective="multi:softmax")
             ŷ = XGB.predict(bst, X_test)
 
+            (predsl .-1) != ŷ && global j += 1
+
             sole_accuracy = sum(predsl .== CategoricalArrays.levelcode.(categorical(y_test)))/length(y_test)
             xgb_accuracy = sum(ŷ .== CategoricalArrays.levelcode.(categorical(y_test)) .- 1)/length(y_test)
 
             sole_accuracy > xgb_accuracy && global outperform += 1
             sole_accuracy < xgb_accuracy && global underperform += 1
-            i += 1
+            global i += 1
         end
     end
 end
 
 @test outperform > underperform
+println("Different predictions: $j out of $i tests.")
 println("SoleModel outperformed XGBoost in $outperform out of $i tests.")
 println("SoleModel underperformed XGBoost in $underperform out of $i tests.")
+println("Tests with differing accuracy: ", outperform + underperform)
+
+"""
+Test with original test operator = <
+Different predictions: 1051 out of 3600 tests.
+SoleModel outperformed XGBoost in 744 out of 3600 tests.
+SoleModel underperformed XGBoost in 231 out of 3600 tests.
+Tests with differing accuracy: 975
+"""
+
+"""
+Test with custom test operator = <=
+Different predictions: 1538 out of 3600 tests.
+SoleModel outperformed XGBoost in 1231 out of 3600 tests.
+SoleModel underperformed XGBoost in 202 out of 3600 tests.
+Tests with differing accuracy: 1433
+
+Note how the accuracy has increased. But we are still working on iris, so this may be an isolated case.
+"""

diff --git a/test/XgBoostExt/xgboost_predict_issue.jl b/test/XgBoostExt/xgboost_predict_issue.jl
index a91a66e..5a73a29 100644
--- a/test/XgBoostExt/xgboost_predict_issue.jl
+++ b/test/XgBoostExt/xgboost_predict_issue.jl
@@ -7,6 +7,13 @@ import XGBoost as XGB
 using CategoricalArrays
 using Random
 
+# References:
+# https://github.com/chengjunhou/xgb2sql/issues/1
+# https://xgboost.readthedocs.io/en/latest/R-package/xgboostfromJSON.html
+
+# for my own reference
+# https://xgboost.readthedocs.io/en/latest/build.html
+
 function predict_xgboost_bag(trees, X; n_classes=0, objective="binary:logistic")
     n_samples = size(X, 1)
     ntree_limit = length(trees)
@@ -15,10 +22,10 @@ function predict_xgboost_bag(trees, X; n_classes=0, objective="binary:logistic")
     # Initialize predictions
     if startswith(objective, "multi:softprob") || startswith(objective, "multi:softmax")
         # For multi-class probabilities, we need a matrix
-        raw_preds = zeros(Float64, n_samples, n_classes)
+        raw_preds = zeros(Float32, n_samples, n_classes)
     else
         # For binary and regression, a vector is sufficient
-        raw_preds = zeros(Float64, n_samples)
+        raw_preds = zeros(Float32, n_samples)
     end
 
@@ -45,9 +52,9 @@ function predict_xgboost_bag(trees, X; n_classes=0, objective="binary:logistic")
         # Apply softmax transformation
         exp_preds = exp.(raw_preds)
         row_sums = sum(exp_preds, dims=2)
-        @show exp_preds
-        @show row_sums
-        @show exp_preds ./ row_sums
+        @show typeof(exp_preds)
+        @show typeof(row_sums)
+        @show typeof(exp_preds ./ row_sums)
         return exp_preds ./ row_sums
     elseif objective == "multi:softmax"
         # Return class with highest score
@@ -68,11 +75,12 @@ end
 
 function predict_tree(tree, X)
     n_samples = size(X, 1)
-    predictions = zeros(Float64, n_samples)
+    predictions = zeros(Float32, n_samples)
 
     for i in 1:n_samples
         predictions[i] = traverse_tree(tree, X[i, :])
     end
+    @show typeof(predictions)
     return predictions
 end
 
@@ -84,9 +92,11 @@ function traverse_tree(tree, x)
     while !isempty(node.children)
         # Get the split feature and value
         feature_idx = node.split
-        split_value = node.split_condition
+        split_value = Float32(node.split_condition)
 
         # Decide which child to go to
+        @show typeof(x[feature_idx])
+        @show typeof(split_value)
         if x[feature_idx] < split_value
             node = node.children[1]
         else
@@ -94,7 +104,8 @@ function traverse_tree(tree, x)
         end
     end
     # Return the leaf value
-    return node.leaf
+    @show typeof(node.leaf)
+    return Float32(node.leaf)
 end
 
 X, y = @load_iris
@@ -137,7 +148,14 @@ rename!(X_test, [:f0, :f1, :f2, :f3])
 # class_probs = predict_xgboost_bag(trees, DataFrame(X_test[27,:]); n_classes=3, objective="multi:softprob") # WORKING
 class_probs = predict_xgboost_bag(trees, DataFrame(X_test[28,:]); n_classes=3, objective="multi:softprob") # NOT WORKING
 class_preds = [argmax(probs) for probs in eachrow(class_probs)] .-1
 
-isapprox(Float32.(class_probs), yyy, atol=1e-5)
+X_train32 = DataFrame(Float32.(Matrix(X_train)), [:f0, :f1, :f2, :f3])
+bst32 = XGB.xgboost((X_train32, yl_train); num_round, eta, num_class=3, objective="multi:softprob")
+xtrs32 = XGB.trees(bst32)
+X_test32 = DataFrame(reshape(Float32.(Vector(X_test[28,:])), 1, :), [:f0, :f1, :f2, :f3])
+class_probs = predict_xgboost_bag(xtrs32, X_test32; n_classes=3, objective="multi:softprob") # NOT WORKING
+
+
+# isapprox(Float32.(class_probs), yyy, atol=1e-5) # note to self: remember "atol"
 
 # # For regression
 # reg_preds = predict_xgboost_bag(mtrs, X_test, objective="reg:squarederror")

From 93fba4de571cd8e6732c36025c34625f4b521ff0 Mon Sep 17 00:00:00 2001
From: PasoStudio73
Date: Mon, 14 Apr 2025 22:36:39 +0200
Subject: [PATCH 13/44] XGBoost test passed --- ext/XGBoostExt.jl | 45 ++- test/XgBoostExt/xgboost.jl | 44 +-- test/XgBoostExt/xgboost_predict_issue.jl | 339 +---------------------- 3 files changed, 39 insertions(+), 389 deletions(-) diff --git a/ext/XGBoostExt.jl b/ext/XGBoostExt.jl index 22cf984..5e58c62 100644 --- a/ext/XGBoostExt.jl +++ b/ext/XGBoostExt.jl @@ -140,6 +140,7 @@ function SoleModels.solemodel( classlabels, featurenames=nothing, keep_condensed=false, + use_float32::Bool=true, kwargs... ) keep_condensed && error("Cannot keep condensed XGBoost.Node.") @@ -152,11 +153,10 @@ function SoleModels.solemodel( # xgboost trees could be composed of only one leaf, without any split if isnothing(t.split) antecedent = Atom(get_condition(class_idx, featurenames; test_operator=(<), featval=Inf)) - ### debug different test_operator TODO delete - # antecedent = Atom(get_condition(class_idx, featurenames; test_operator=(≤), featval=Inf)) - early_return(t.leaf, antecedent, clabels, classlabels[class_idx]) + leaf = use_float32 ? Float32(t.leaf) : t.leaf + early_return(leaf, antecedent, clabels, classlabels[class_idx]) else - SoleModels.solemodel(t, X, y; classlabels, featurenames, class_idx, clabels, kwargs...) + SoleModels.solemodel(t, X, y; classlabels, class_idx, clabels, featurenames, use_float32, kwargs...) end end @@ -184,32 +184,28 @@ function SoleModels.solemodel( X::AbstractMatrix, y::AbstractVector; classlabels, - path_conditions=Formula[], - featurenames=nothing, class_idx, - clabels + clabels, + featurenames=nothing, + path_conditions=Formula[], + use_float32::Bool, ) - antecedent = Atom(get_condition(tree.split, tree.split_condition, featurenames; test_operator=(<))) - ### debug different test_operator TODO delete - # antecedent = Atom(get_condition(tree.split, tree.split_condition, featurenames; test_operator=(≤))) +split_condition = use_float32 ? 
Float32(tree.split_condition) : tree.split_condition + antecedent = Atom(get_condition(tree.split, split_condition, featurenames; test_operator=(<))) # create a new path for the left branch left_path = copy(path_conditions) - push!(left_path, Atom(get_condition(tree.split, tree.split_condition, featurenames; test_operator=(<)))) - ### debug different test_operator TODO delete - # push!(left_path, Atom(get_condition(tree.split, tree.split_condition, featurenames; test_operator=(≤)))) + push!(left_path, Atom(get_condition(tree.split, split_condition, featurenames; test_operator=(<)))) # create a new path for the right branch right_path = copy(path_conditions) - push!(right_path, Atom(get_condition(tree.split, tree.split_condition, featurenames; test_operator=(≥)))) - ### debug different test_operator TODO delete - # push!(right_path, Atom(get_condition(tree.split, tree.split_condition, featurenames; test_operator=(>)))) + push!(right_path, Atom(get_condition(tree.split, split_condition, featurenames; test_operator=(≥)))) lefttree = if isnothing(tree.children[1].split) # @show SoleModels.join_antecedents(left_path) - xgbleaf(tree.children[1], left_path, X, y) + xgbleaf(tree.children[1], left_path, X, y; use_float32) else - SoleModels.solemodel(tree.children[1], X, y; path_conditions=left_path, classlabels, class_idx, clabels,featurenames) + SoleModels.solemodel(tree.children[1], X, y; path_conditions=left_path, classlabels, class_idx, clabels, featurenames, use_float32) end isnothing(lefttree) && begin @@ -218,9 +214,9 @@ function SoleModels.solemodel( righttree = if isnothing(tree.children[2].split) # @show SoleModels.join_antecedents(right_path) - xgbleaf(tree.children[2], right_path, X, y) + xgbleaf(tree.children[2], right_path, X, y; use_float32) else - SoleModels.solemodel(tree.children[2], X, y; path_conditions=right_path, classlabels, class_idx, clabels, featurenames) + SoleModels.solemodel(tree.children[2], X, y; path_conditions=right_path, classlabels, class_idx, clabels, featurenames, use_float32) end isnothing(righttree) && begin @@ -239,7 +235,8 @@ function xgbleaf( leaf::XGBoost.Node, formula::Vector{<:Formula}, X::AbstractMatrix, - y::AbstractVector + y::AbstractVector; + use_float32::Bool, ) bitX = bitmap_check_conditions(X, formula) prediction = SoleModels.bestguess(y[bitX]; suppress_parity_warning=true) @@ -247,10 +244,10 @@ function xgbleaf( isnothing(prediction) && return nothing + leaf_values = use_float32 ? 
Float32(leaf.leaf) : leaf.leaf
+
     info = (;
-        # leaf_values = leaf.leaf,
-        ### debug convert to Float32 TODO delete
-        leaf_values = Float32(leaf.leaf),
+        leaf_values,
         supporting_predictions = fill(prediction, length(labels)),
         supporting_labels = labels,
     )
diff --git a/test/XgBoostExt/xgboost.jl b/test/XgBoostExt/xgboost.jl
index 9a1ef99..84983b1 100644
--- a/test/XgBoostExt/xgboost.jl
+++ b/test/XgBoostExt/xgboost.jl
@@ -87,11 +87,6 @@ ŷ = XGB.predict(bst, X_test)
 predsl = CategoricalArrays.levelcode.(categorical(preds)) .- 1
 @test predsl == ŷ
 
-outperform = 0
-underperform = 0
-i = 0
-j = 0
-
 for seed in 1:40
     rng = Xoshiro(seed)
     train, test = partition(eachindex(y), train_ratio; shuffle=true, rng)
@@ -103,47 +98,16 @@ for seed in 1:40
             mach = machine(model, X_train, y_train)
             fit!(mach)
             trees = XGB.trees(mach.fitresult[1])
-            solem = solemodel(trees, Matrix(X_train), y_train; classlabels, featurenames)
-            preds = apply(solem, X_test)
+            solem = solemodel(trees, Matrix(X_train), y_train; classlabels, featurenames, use_float32=true)
+            X_test_f32 = mapcols(col -> Float32.(col), X_test)
+            preds = apply(solem, X_test_f32)
             predsl = CategoricalArrays.levelcode.(categorical(preds))
 
             yl_train = CategoricalArrays.levelcode.(categorical(y_train)) .- 1
             bst = XGB.xgboost((X_train, yl_train); num_round, eta, num_class=3, objective="multi:softmax")
             ŷ = XGB.predict(bst, X_test)
 
-            (predsl .-1) != ŷ && global j += 1
-
-            sole_accuracy = sum(predsl .== CategoricalArrays.levelcode.(categorical(y_test)))/length(y_test)
-            xgb_accuracy = sum(ŷ .== CategoricalArrays.levelcode.(categorical(y_test)) .- 1)/length(y_test)
-
-            sole_accuracy > xgb_accuracy && global outperform += 1
-            sole_accuracy < xgb_accuracy && global underperform += 1
-            global i += 1
+            @test (predsl .-1) == ŷ
         end
     end
 end
-
-@test outperform > underperform
-println("Different predictions: $j out of $i tests.")
-println("SoleModel outperformed XGBoost in $outperform out of $i tests.")
-println("SoleModel underperformed XGBoost in $underperform out of $i tests.")
-println("Tests with differing accuracy: ", outperform + underperform)
-
-"""
-Test with original test operator = <
-Different predictions: 1051 out of 3600 tests.
-SoleModel outperformed XGBoost in 744 out of 3600 tests.
-SoleModel underperformed XGBoost in 231 out of 3600 tests.
-Tests with differing accuracy: 975
-"""
-
-"""
-Test with custom test operator = <=
-Different predictions: 1538 out of 3600 tests.
-SoleModel outperformed XGBoost in 1231 out of 3600 tests.
-SoleModel underperformed XGBoost in 202 out of 3600 tests.
-Tests with differing accuracy: 1433
-
-Note how the accuracy has increased. But we are still working on iris, so this may be an isolated case.
-"""
-
diff --git a/test/XgBoostExt/xgboost_predict_issue.jl b/test/XgBoostExt/xgboost_predict_issue.jl
index 5a73a29..e6d0a9f 100644
--- a/test/XgBoostExt/xgboost_predict_issue.jl
+++ b/test/XgBoostExt/xgboost_predict_issue.jl
@@ -32,13 +32,11 @@ function predict_xgboost_bag(trees, X; n_classes=0, objective="binary:logistic")
     for i in 1:ntree_limit
         tree = trees[i]
         tree_preds = predict_tree(tree, X)
-        @show tree_preds
+
         if startswith(objective, "multi:softprob") || startswith(objective, "multi:softmax")
             # For multi-class softprob, each tree outputs predictions for a specific class
             class_idx = (i - 1) % n_classes + 1
             raw_preds[:, class_idx] .+= tree_preds
-            @show class_idx
-            @show raw_preds
         else
             # For binary or regression, simply add the predictions
             raw_preds .+= tree_preds
@@ -52,9 +50,6 @@ function predict_xgboost_bag(trees, X; n_classes=0, objective="binary:logistic")
         # Apply softmax transformation
         exp_preds = exp.(raw_preds)
         row_sums = sum(exp_preds, dims=2)
-        @show typeof(exp_preds)
-        @show typeof(row_sums)
-        @show typeof(exp_preds ./ row_sums)
         return exp_preds ./ row_sums
     elseif objective == "multi:softmax"
         # Return class with highest score
@@ -80,7 +75,6 @@ function predict_tree(tree, X)
     for i in 1:n_samples
         predictions[i] = traverse_tree(tree, X[i, :])
     end
-    @show typeof(predictions)
     return predictions
 end
 
@@ -95,8 +89,6 @@ function traverse_tree(tree, x)
         split_value = Float32(node.split_condition)
 
         # Decide which child to go to
-        @show typeof(x[feature_idx])
-        @show typeof(split_value)
         if x[feature_idx] < split_value
             node = node.children[1]
         else
@@ -104,7 +96,6 @@ function traverse_tree(tree, x)
         end
     end
     # Return the leaf value
-    @show typeof(node.leaf)
     return Float32(node.leaf)
 end
 
@@ -121,18 +112,25 @@ XGTrees = MLJ.@load XGBoostClassifier pkg=XGBoost
 model = XGTrees(; num_round, eta, objective="multi:softprob")
 mach = machine(model, X_train, y_train)
 fit!(mach)
-# mlj_predict = predict(mach, DataFrame(X_test[27,:])) # WORKING
-mlj_predict = predict(mach, DataFrame(X_test[28,:])) # NOT WORKING
+# mlj_predict = predict(mach, DataFrame(X_test[27,:]))
+mlj_predict = predict(mach, DataFrame(X_test[28,:]))
+
 trees = XGB.trees(mach.fitresult[1])
 get_encoding(classes_seen) = Dict(MMI.int(c) => c for c in MMI.classes(classes_seen))
 get_classlabels(encoding) = [string(encoding[i]) for i in sort(keys(encoding) |> collect)]
 encoding = get_encoding(mach.fitresult[2])
 classlabels = get_classlabels(encoding)
-@show classlabels
 featurenames = mach.report.vals[1].features
+
+solem = solemodel(trees, Matrix(X_train), y_train; classlabels, featurenames, use_float32=false)
+preds = apply(solem, DataFrame(reshape(Float32.(Vector(X_test[28,:])), 1, :), :auto)) # NOT WORKING
+
+solem = solemodel(trees, Matrix(X_train), y_train; classlabels, featurenames, use_float32=true)
+preds = apply(solem, DataFrame(reshape(Float32.(Vector(X_test[28,:])), 1, :), :auto)) # WORKING
+
 solem = solemodel(trees, Matrix(X_train), y_train; classlabels, featurenames)
-# preds = apply(solem, DataFrame(X_test[27,:])) # WORKING
-preds = apply(solem, DataFrame(X_test[28,:])) # NOT WORKING
+preds = apply(solem, DataFrame(reshape(Float32.(Vector(X_test[28,:])), 1, :), :auto)) # WORKING
+
 predsl = CategoricalArrays.levelcode.(categorical(preds)) .- 1
 
 yl_train = CategoricalArrays.levelcode.(categorical(y_train)) .- 1
@@ -152,313 +150,4 @@ X_train32 = DataFrame(Float32.(Matrix(X_train)), [:f0, :f1, :f2, :f3])
 bst32 = XGB.xgboost((X_train32, yl_train); num_round, eta, num_class=3, objective="multi:softprob")
 xtrs32 = XGB.trees(bst32)
 X_test32 = DataFrame(reshape(Float32.(Vector(X_test[28,:])), 1, :), [:f0, :f1, :f2, :f3])
-class_probs = predict_xgboost_bag(xtrs32, X_test32; n_classes=3, objective="multi:softprob") # NOT WORKING
-
-
-# isapprox(Float32.(class_probs), yyy, atol=1e-5) # note to self: remember "atol"
-
-# # For regression
-# reg_preds = predict_xgboost_bag(mtrs, X_test, objective="reg:squarederror")
-
-# num_round = 20
-# eta = 0.3
-# yl_train = CategoricalArrays.levelcode.(categorical(y_train)) .- 1
-# bst = XGB.xgboost((X_train, yl_train); num_round, eta, num_class=3, objective="multi:softmax")
-# ŷ = XGB.predict(bst, X_test)
-
-### TREE 1
-"""
-xtrs[1].cover = 53.3333282
-xtrs[1].gain = 55.7546806
-xtrs[1].nmissing = 2
-xtrs[1].yes = 1
-xtrs[1].no = 2
-xtrs[1].split = "petal_length"
-xtrs[1].split_condition = 3.0
-
-xtrs[1].children[1].cover = 16.8888874
-xtrs[1].children[1].id = 1
-xtrs[1].children[1].leaf = 0.141614899
-
-xtrs[1].children[2].cover = 36.4444427
-xtrs[1].children[2].id = 2
-xtrs[1].children[2].leaf = -0.072997041
-
-solem.models[1].info =
-(leaf_values = [0.141614899, -0.072997041],
- supporting_predictions = CategoricalValue{String, UInt32}["setosa", "setosa", "setosa", "virginica", "virginica", "virginica"],
- supporting_labels = ["setosa", "virginica", "versicolor", "setosa", "virginica", "versicolor"],)
-
- solem.models[1].antecedent =
-Atom{ScalarCondition{Float64, VariableValue{Int64, Symbol}, ScalarMetaCondition{VariableValue{Int64, Symbol}, typeof(<)}}}: [petal_length] < 3.0
-
-solem.models[1].posconsequent.outcome = CategoricalValue{String, UInt32} "setosa"
-solem.models[1].posconsequent.info =
-(leaf_values = 0.141614899,
- supporting_predictions = CategoricalValue{String, UInt32}["setosa", "setosa", "setosa"],
- supporting_labels = ["setosa", "virginica", "versicolor"],)
-
-solem.models[1].negconsequent.outcome = CategoricalValue{String, UInt32} "virginica"
- solem.models[1].negconsequent.info =
-(leaf_values = -0.072997041,
- supporting_predictions = CategoricalValue{String, UInt32}["virginica", "virginica", "virginica"],
- supporting_labels = ["setosa", "virginica", "versicolor"],)
-"""
-
-### TREE 2
-"""
-xtrs[2].cover = 53.3333282
-xtrs[2].gain = 11.9339008
-xtrs[2].nmissing = 2
-xtrs[2].yes = 1
-xtrs[2].no = 2
-xtrs[2].split = "petal_length"
-xtrs[2].split_condition = 3.0
-
- xtrs[2]children[1].cover = 16.8888874
- xtrs[2]children[1].id = 1
- xtrs[2]children[1].leaf = -0.070807457
-
- xtrs[2].children[2].cover = 36.4444427
- xtrs[2].children[2].gain = 35.383049
- xtrs[2].children[2].nmissing = 4
- xtrs[2].children[2].yes = 3
- xtrs[2].children[2].no = 4
- xtrs[2].children[2].split = "petal_length"
- xtrs[2].children[2].split_condition = 4.9000001
-
- xtrs[2].children[2].children[1].cover = 17.7777767
- xtrs[2].children[2].children[1].gain = 4.09395218
- xtrs[2].children[2].children[1].nmissing = 6
- xtrs[2].children[2].children[1].yes = 5
- xtrs[2].children[2].children[1].no = 6
- xtrs[2].children[2].children[1].split = "petal_width"
- xtrs[2].children[2].children[1].split_condition = 1.70000005
-
- xtrs[2].children[2].children[1].children[1].cover = 15.999999
- xtrs[2].children[2].children[1].children[1].id = 5
- xtrs[2].children[2].children[1].children[1].leaf = 0.141176477
-
- xtrs[2].children[2].children[1].children[2].cover = 1.77777767
- xtrs[2].children[2].children[1].children[2].id = 6
- xtrs[2].children[2].children[1].children[2].leaf = -0.0120000029
-
- xtrs[2].children[2].children[2].cover = 
18.666666 - xtrs[2].children[2].children[2].gain = 0.264455795 - xtrs[2].children[2].children[2].nmissing = 8 - xtrs[2].children[2].children[2].yes = 7 - xtrs[2].children[2].children[2].no = 8 - xtrs[2].children[2].children[2].split = "petal_width" - xtrs[2].children[2].children[2].split_condition = 1.70000005 - - xtrs[2].children[2].children[2].children[1].cover = 2.22222209 - xtrs[2].children[2].children[2].children[1].id = 7 - xtrs[2].children[2].children[2].children[1].leaf = -0.0206896588 - - xtrs[2].children[2].children[2].children[2].cover = 16.4444427 - xtrs[2].children[2].children[2].children[2].id = 8 - xtrs[2].children[2].children[2].children[2].leaf = -0.0707006454 - -solem.models[2].info = -(leaf_values = [-0.070807457, 0.141176477, -0.0120000029, -0.0206896588, -0.0707006454], - supporting_predictions = CategoricalValue{String, UInt32}["setosa", "setosa", "setosa", "versicolor", "versicolor", "versicolor", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica"], - supporting_labels = ["setosa", "virginica", "versicolor", "setosa", "virginica", "versicolor", "setosa", "virginica", "versicolor", "setosa", "virginica", "versicolor", "setosa", "virginica", "versicolor"],) -solem.models[2].antecedent = -Atom{ScalarCondition{Float64, VariableValue{Int64, Symbol}, ScalarMetaCondition{VariableValue{Int64, Symbol}, typeof(<)}}}: [petal_length] < 3.0 - - solem.models[2].posconsequent.outcome = CategoricalValue{String, UInt32} "setosa" - solem.models[2].posconsequent.info = - (leaf_values = -0.070807457, - supporting_predictions = CategoricalValue{String, UInt32}["setosa", "setosa", "setosa"], - supporting_labels = ["setosa", "virginica", "versicolor"],) - - solem.models[2].negconsequent.antecedent = - Atom{ScalarCondition{Float64, VariableValue{Int64, Symbol}, ScalarMetaCondition{VariableValue{Int64, Symbol}, typeof(<)}}}: [petal_length] < 4.9000001 - solem.models[2].negconsequent.info = - (leaf_values = [0.141176477, -0.0120000029, -0.0206896588, -0.0707006454], - supporting_predictions = CategoricalValue{String, UInt32}["versicolor", "versicolor", "versicolor", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica"], - supporting_labels = ["setosa", "virginica", "versicolor", "setosa", "virginica", "versicolor", "setosa", "virginica", "versicolor", "setosa", "virginica", "versicolor"],) - - solem.models[2].negconsequent.posconsequent.antecedent = - Atom{ScalarCondition{Float64, VariableValue{Int64, Symbol}, ScalarMetaCondition{VariableValue{Int64, Symbol}, typeof(<)}}}: [petal_width] < 1.70000005 - solem.models[2].negconsequent.posconsequent.info = - (leaf_values = [0.141176477, -0.0120000029], - supporting_predictions = CategoricalValue{String, UInt32}["versicolor", "versicolor", "versicolor", "virginica", "virginica", "virginica"], - supporting_labels = ["setosa", "virginica", "versicolor", "setosa", "virginica", "versicolor"],) - - solem.models[2].negconsequent.posconsequent.posconsequent.outcome = CategoricalValue{String, UInt32} "versicolor" - solem.models[2].negconsequent.posconsequent.posconsequent.info = - (leaf_values = 0.141176477, - supporting_predictions = CategoricalValue{String, UInt32}["versicolor", "versicolor", "versicolor"], - supporting_labels = ["setosa", "virginica", "versicolor"],) - - solem.models[2].negconsequent.posconsequent.negconsequent.outcome = CategoricalValue{String, UInt32} "virginica" - 
solem.models[2].negconsequent.posconsequent.negconsequent.info = - (leaf_values = -0.0120000029, - supporting_predictions = CategoricalValue{String, UInt32}["virginica", "virginica", "virginica"], - supporting_labels = ["setosa", "virginica", "versicolor"],) - - solem.models[2].negconsequent.negconsequent.antecedent = - Atom{ScalarCondition{Float64, VariableValue{Int64, Symbol}, ScalarMetaCondition{VariableValue{Int64, Symbol}, typeof(<)}}}: [petal_width] < 1.70000005 - solem.models[2].negconsequent.negconsequent.info = - (leaf_values = [-0.0206896588, -0.0707006454], - supporting_predictions = CategoricalValue{String, UInt32}["virginica", "virginica", "virginica", "virginica", "virginica", "virginica"], - supporting_labels = ["setosa", "virginica", "versicolor", "setosa", "virginica", "versicolor"],) - - solem.models[2].negconsequent.negconsequent.posconsequent.outcome = CategoricalValue{String, UInt32} "virginica" - solem.models[2].negconsequent.negconsequent.posconsequent.info = - (leaf_values = -0.0206896588, - supporting_predictions = CategoricalValue{String, UInt32}["virginica", "virginica", "virginica"], - supporting_labels = ["setosa", "virginica", "versicolor"],) - - solem.models[2].negconsequent.negconsequent.negconsequent.outcome = CategoricalValue{String, UInt32} "virginica" - solem.models[2].negconsequent.negconsequent.negconsequent.info = - (leaf_values = -0.0707006454, - supporting_predictions = CategoricalValue{String, UInt32}["virginica", "virginica", "virginica"], - supporting_labels = ["setosa", "virginica", "versicolor"],) -""" - -### TREE 3 -""" -xtrs[3].cover = 53.3333282 -xtrs[3].gain = 51.9276886 -xtrs[3].nmissing = 2 -xtrs[3].yes = 1 -xtrs[3].no = 2 -xtrs[3].split = "petal_length" -xtrs[3].split_condition = 4.80000019 - - xtrs[3].children[1].cover = 32.8888855 - xtrs[3].children[1].gain = 0.676908493 - xtrs[3].children[1].nmissing = 4 - xtrs[3].children[1].yes = 3 - xtrs[3].children[1].no = 4 - xtrs[3].children[1].split = "petal_width" - xtrs[3].children[1].split_condition = 1.60000002 - - xtrs[3].children[1].children[1].cover = 31.5555534 - xtrs[3].children[1].children[1].id = 3 - xtrs[3].children[1].children[1].leaf = -0.0726962537 - - xtrs[3].children[1].children[2].cover = 1.33333325 - xtrs[3].children[1].children[2].id = 4 - xtrs[3].children[1].children[2].leaf = -2.55448485e-9 - - xtrs[3].children[2].cover = 20.4444427 - xtrs[3].children[2].gain = 1.53349686 - xtrs[3].children[2].nmissing = 6 - xtrs[3].children[2].yes = 5 - xtrs[3].children[2].no = 6 - xtrs[3].children[2].split = "petal_length" - xtrs[3].children[2].split_condition = 4.9000001 - - xtrs[3].children[2].children[1].cover = 1.77777767 - xtrs[3].children[2].children[1].id = 5 - xtrs[3].children[2].children[1].leaf = 0.0239999983 - - xtrs[3].children[2].children[2].cover = 18.666666 - xtrs[3].children[2].children[2].id = 6 - xtrs[3].children[2].children[2].leaf = 0.137288138 - -solem.models[3].info = -(leaf_values = [-0.0726962537, -2.55448485e-9, 0.0239999983, 0.137288138], - supporting_predictions = CategoricalValue{String, UInt32}["setosa", "setosa", "setosa", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica", "virginica"], - supporting_labels = ["setosa", "virginica", "versicolor", "setosa", "virginica", "versicolor", "setosa", "virginica", "versicolor", "setosa", "virginica", "versicolor"],) -solem.models[3].antecedent = Atom{ScalarCondition{Float64, VariableValue{Int64, Symbol}, ScalarMetaCondition{VariableValue{Int64, Symbol}, 
typeof(<)}}}: [petal_length] < 4.80000019
-
-    solem.models[3].posconsequent.info =
-    (leaf_values = [-0.0726962537, -2.55448485e-9],
-    supporting_predictions = CategoricalValue{String, UInt32}["setosa", "setosa", "setosa", "virginica", "virginica", "virginica"],
-    supporting_labels = ["setosa", "virginica", "versicolor", "setosa", "virginica", "versicolor"],)
-    solem.models[3].posconsequent.antecedent = Atom{ScalarCondition{Float64, VariableValue{Int64, Symbol}, ScalarMetaCondition{VariableValue{Int64, Symbol}, typeof(<)}}}: [petal_width] < 1.60000002
-
-    solem.models[3].posconsequent.posconsequent.info
-    (leaf_values = -0.0726962537,
-    supporting_predictions = CategoricalValue{String, UInt32}["setosa", "setosa", "setosa"],
-    supporting_labels = ["setosa", "virginica", "versicolor"],)
-    solem.models[3].posconsequent.posconsequent.outcome = CategoricalValue{String, UInt32} "setosa"
-
-    solem.models[3].posconsequent.negconsequent.info =
-    (leaf_values = -2.55448485e-9,
-    supporting_predictions = CategoricalValue{String, UInt32}["virginica", "virginica", "virginica"],
-    supporting_labels = ["setosa", "virginica", "versicolor"],)
-    solem.models[3].posconsequent.negconsequent.outcome = CategoricalValue{String, UInt32} "virginica"
-
-    solem.models[3].negconsequent.info =
-    (leaf_values = [0.0239999983, 0.137288138],
-    supporting_predictions = CategoricalValue{String, UInt32}["virginica", "virginica", "virginica", "virginica", "virginica", "virginica"],
-    supporting_labels = ["setosa", "virginica", "versicolor", "setosa", "virginica", "versicolor"],)
-    solem.models[3].negconsequent.antecedent = Atom{ScalarCondition{Float64, VariableValue{Int64, Symbol}, ScalarMetaCondition{VariableValue{Int64, Symbol}, typeof(<)}}}: [petal_length] < 4.9000001
-
-    solem.models[3].negconsequent.posconsequent.info =
-    (leaf_values = 0.0239999983,
-    supporting_predictions = CategoricalValue{String, UInt32}["virginica", "virginica", "virginica"],
-    supporting_labels = ["setosa", "virginica", "versicolor"],)
-    solem.models[3].negconsequent.posconsequent.outcome = CategoricalValue{String, UInt32} "virginica"
-
-    solem.models[3].negconsequent.negconsequent.info =
-    (leaf_values = 0.137288138,
-    supporting_predictions = CategoricalValue{String, UInt32}["virginica", "virginica", "virginica"],
-    supporting_labels = ["setosa", "virginica", "versicolor"],)
-    solem.models[3].negconsequent.negconsequent.outcome = CategoricalValue{String, UInt32} "virginica"
-"""
-
-# calculating the probabilities
-
-# Row │ sepal_length  sepal_width  petal_length  petal_width
-#     │ Float64       Float64      Float64       Float64
-# ─────┼──────────────────────────────────────────────────────
-#   1 │ 6.9           3.1          4.9           1.5
-
-### TREE 1: probability of setosa
-"""
-"petal_length" < 3.0 -- no >> leaf = -0.072997041
-"""
-
-### TREE 2: probability of versicolor
-"""
-"petal_length" < 3.0 -- no > "petal_length" < 4.9000001 -- yes > "petal_width" < 1.70000005 -- yes >> leaf = 0.141176477
-"""
-
-### TREE 3: probability of virginica
-"""
-"petal_length" < 4.80000019 -- no > "petal_length" < 4.9000001 -- yes >> leaf = 0.0239999983
-"""
-
-### calculating multi:softprob
-"""
-exp_preds = exp.(-0.072997041 0.141176477 0.0239999983) = 0.929604 1.15163 1.02429
-row_sums = sum(exp_preds, dims=2) = 3.1055217627515077
-probability = exp_preds / row_sums = 0.299339 0.370832 0.329829
-
-XGBoost probability: 0.304161 0.320495 0.375344
-"""
-
-"""
-### reasoning by contradiction: a rounding problem ###
-"petal_length" = 4.9
-"petal_length" < 4.9000001 is evaluated as false
-
-so:
-# tree 2
-"petal_length" < 3.0 -- no > "petal_length" < 4.9000001 -- no > "petal_width" < 1.70000005 -- yes >> leaf = -0.0206896588
-# tree 3
-"petal_length" < 4.80000019 -- no > "petal_length" < 4.9000001 -- no >> leaf = 0.137288138
-"""
-exp_preds = exp.([-0.072997041, -0.0206896588, 0.137288138])
-row_sums = sum(exp_preds)
-probability = exp_preds ./ row_sums
-
-"""
-# 3-element Vector{Float64}:
-# 0.3041612750760762
-# 0.320494608175597
-# 0.3753441167483268
-
-# XGBoost probability: 0.304161 0.320495 0.375344
-
-PROBLEM SOLVED
-if we evaluate
-4.9 < 4.9000001 as false
-then we get the same result as XGBoost's predict
-"""
\ No newline at end of file
+class_probs32 = predict_xgboost_bag(xtrs32, X_test32; n_classes=3, objective="multi:softprob") # NOT WORKING

From ae4e5a1323bd5f1e0fc927d2fca34d7befb5c8b8 Mon Sep 17 00:00:00 2001
From: PasoStudio73
Date: Tue, 15 Apr 2025 00:02:50 +0200
Subject: [PATCH 14/44] working on tests

---
 ext/XGBoostExt.jl             |  2 +-
 test/XGBoostExt.jl            | 47 -------------------
 .../xgboost_classifier.jl}    |  2 +
 .../xgboost_predict_issue.jl  |  0
 test/misc.jl                  |  4 +-
 test/runtests.jl              |  2 +-
 6 files changed, 6 insertions(+), 51 deletions(-)
 delete mode 100644 test/XGBoostExt.jl
 rename test/{XgBoostExt/xgboost.jl => XGBoostExt/xgboost_classifier.jl} (98%)
 rename test/{XgBoostExt => XGBoostExt}/xgboost_predict_issue.jl (100%)

diff --git a/ext/XGBoostExt.jl b/ext/XGBoostExt.jl
index 5e58c62..25df6d3 100644
--- a/ext/XGBoostExt.jl
+++ b/ext/XGBoostExt.jl
@@ -8,7 +8,7 @@ using CategoricalArrays
 import SoleModels: alphabet, solemodel
 
 function alphabet(model::XGBoost.Booster; kwargs...)
-    error("TODO fix and test.")
+    # error("TODO fix and test.")
     function _alphabet!(a::Vector, model::XGBoost.Booster; kwargs...)
         return a
     end
diff --git a/test/XGBoostExt.jl b/test/XGBoostExt.jl
deleted file mode 100644
index 1ae6c42..0000000
--- a/test/XGBoostExt.jl
+++ /dev/null
@@ -1,47 +0,0 @@
-
-# Import necessary libraries
-using MLJ
-using DataFrames
-
-# Load the Iris dataset
-X, y = @load_iris
-X = DataFrame(X)
-
-# Convert the target variable to categorical
-y = coerce(y, Multiclass)
-
-# Split the dataset into training and testing sets
-train, test = partition(eachindex(y), 0.8, shuffle=true)
-X_train, X_test = X[train, :], X[test, :]
-y_train, y_test = y[train], y[test]
-
-# Load the XGBoost classifier
-XGBoostClassifier = @load XGBoostClassifier pkg=XGBoost
-
-# Create the model and set hyperparameters
-mljmodel = XGBoostClassifier()
-
-# Wrap the model with the data
-mach = machine(mljmodel, X_train, y_train)
-
-# Train the model
-fit!(mach)
-
-# Make predictions
-y_pred = predict(mach, X_test)
-
-# Evaluate test accuracy
-acc = mean(mode.(y_pred) .== y_test)
-
-# Print the test accuracy
-println("Test Accuracy: $acc")
-
-
-
-using SoleModels
-
-@test_nowarn alphabet(fitted_params(mach).fitresult[1])
-
-model = fitted_params(mach).fitresult[1]
-
-@test_broken solemodel(model)
diff --git a/test/XgBoostExt/xgboost.jl b/test/XGBoostExt/xgboost_classifier.jl
similarity index 98%
rename from test/XgBoostExt/xgboost.jl
rename to test/XGBoostExt/xgboost_classifier.jl
index 84983b1..b2aa9cc 100644
--- a/test/XgBoostExt/xgboost.jl
+++ b/test/XGBoostExt/xgboost_classifier.jl
@@ -87,6 +87,8 @@ ŷ = XGB.predict(bst, X_test)
 predsl = CategoricalArrays.levelcode.(categorical(preds)) .- 1
 @test predsl == ŷ
 
+@test_nowarn alphabet(fitted_params(mach).fitresult[1])
+
 for seed in 1:40
     rng = Xoshiro(seed)
     train, test = partition(eachindex(y), train_ratio; shuffle=true, rng)
diff --git 
a/test/XgBoostExt/xgboost_predict_issue.jl b/test/XGBoostExt/xgboost_predict_issue.jl similarity index 100% rename from test/XgBoostExt/xgboost_predict_issue.jl rename to test/XGBoostExt/xgboost_predict_issue.jl diff --git a/test/misc.jl b/test/misc.jl index cbafaf2..4af76b4 100644 --- a/test/misc.jl +++ b/test/misc.jl @@ -1,6 +1,6 @@ -using Revise +# using Revise -using Reexport +# using Reexport using FunctionWrappers: FunctionWrapper using Test using SoleLogics diff --git a/test/runtests.jl b/test/runtests.jl index 8d95df5..b6d170a 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -22,7 +22,7 @@ test_suites = [ ("Linear forms", ["linear-form-utilities.jl", ]), ("Pluto Demo", ["$(dirname(dirname(pathof(SoleModels))))/pluto-demo.jl", ]), ("DecisionTreeExt", ["DecisionTreeExt/tree.jl", "DecisionTreeExt/forest.jl"]), - ("XGBoostExt", ["XGBoostExt.jl"]), + ("XGBoostExt", ["XGBoostExt/xgboost_classifier.jl"]), ] @testset "SoleModels.jl" begin From 52deed0eb603cfb8da24718c54734773f7dd10a7 Mon Sep 17 00:00:00 2001 From: PasoStudio73 Date: Tue, 15 Apr 2025 00:37:37 +0200 Subject: [PATCH 15/44] codecov working --- Project.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/Project.toml b/Project.toml index 6a4f80a..caab585 100644 --- a/Project.toml +++ b/Project.toml @@ -33,6 +33,7 @@ ZipFile = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea" [weakdeps] DecisionTree = "7806a523-6efd-50cb-b5f6-3fa6f1930dbb" XGBoost = "009559a3-9522-5dbb-924b-0b6ed2b22bb9" +MLJModelInterface = "e80e1ace-859a-464e-9ed9-23947d8ae3ea" [extensions] DecisionTreeExt = "DecisionTree" From 7c4f0256487bc6b6ad454c6a67df98e143c565e0 Mon Sep 17 00:00:00 2001 From: PasoStudio73 Date: Tue, 15 Apr 2025 01:04:50 +0200 Subject: [PATCH 16/44] removed MLJXGBoostInterface in tests --- Project.toml | 1 - ext/XGBoostExt.jl | 4 +--- test/XGBoostExt/xgboost_classifier.jl | 2 +- test/XGBoostExt/xgboost_predict_issue.jl | 5 ++++- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Project.toml b/Project.toml index caab585..6a4f80a 100644 --- a/Project.toml +++ b/Project.toml @@ -33,7 +33,6 @@ ZipFile = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea" [weakdeps] DecisionTree = "7806a523-6efd-50cb-b5f6-3fa6f1930dbb" XGBoost = "009559a3-9522-5dbb-924b-0b6ed2b22bb9" -MLJModelInterface = "e80e1ace-859a-464e-9ed9-23947d8ae3ea" [extensions] DecisionTreeExt = "DecisionTree" diff --git a/ext/XGBoostExt.jl b/ext/XGBoostExt.jl index 25df6d3..18ea25b 100644 --- a/ext/XGBoostExt.jl +++ b/ext/XGBoostExt.jl @@ -115,9 +115,7 @@ end function early_return(leaf, antecedent, clabel, classl) info =(; - # leaf_values = leaf, - ### debug convert to Float32 TODO delete - leaf_values = Float32(leaf), + leaf_values = leaf, supporting_predictions = clabel, supporting_labels = [classl], ) diff --git a/test/XGBoostExt/xgboost_classifier.jl b/test/XGBoostExt/xgboost_classifier.jl index b2aa9cc..f02de30 100644 --- a/test/XGBoostExt/xgboost_classifier.jl +++ b/test/XGBoostExt/xgboost_classifier.jl @@ -4,7 +4,7 @@ using MLJ using MLJBase using DataFrames -using MLJXGBoostInterface +# using MLJXGBoostInterface using SoleModels import MLJModelInterface as MMI diff --git a/test/XGBoostExt/xgboost_predict_issue.jl b/test/XGBoostExt/xgboost_predict_issue.jl index e6d0a9f..0a9af41 100644 --- a/test/XGBoostExt/xgboost_predict_issue.jl +++ b/test/XGBoostExt/xgboost_predict_issue.jl @@ -1,6 +1,6 @@ using MLJ using DataFrames -using MLJXGBoostInterface +# using MLJXGBoostInterface import MLJModelInterface as MMI using SoleModels import XGBoost as XGB @@ -124,12 +124,15 @@ 
featurenames = mach.report.vals[1].features solem = solemodel(trees, Matrix(X_train), y_train; classlabels, featurenames, use_float32=false) preds = apply(solem, DataFrame(reshape(Float32.(Vector(X_test[28,:])), 1, :), :auto)) # NOT WORKING +@test preds[1] == "versicolor" solem = solemodel(trees, Matrix(X_train), y_train; classlabels, featurenames, use_float32=true) preds = apply(solem, DataFrame(reshape(Float32.(Vector(X_test[28,:])), 1, :), :auto)) # WORKING +@test preds[1] == "virginica" solem = solemodel(trees, Matrix(X_train), y_train; classlabels, featurenames) preds = apply(solem, DataFrame(reshape(Float32.(Vector(X_test[28,:])), 1, :), :auto)) # WORKING +@test preds[1] == "virginica" predsl = CategoricalArrays.levelcode.(categorical(preds)) .- 1 From 659322252fe95d98b17b8dee0411c5e04e0c71a5 Mon Sep 17 00:00:00 2001 From: PasoStudio73 Date: Tue, 15 Apr 2025 01:15:58 +0200 Subject: [PATCH 17/44] still fixing codecov --- Project.toml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/Project.toml b/Project.toml index 6a4f80a..8122531 100644 --- a/Project.toml +++ b/Project.toml @@ -52,10 +52,6 @@ Graphs = "1.8" HTTP = "1.9" IterTools = "1" Lazy = "0.15.1" -MLJ = "0.19 - 0.20" -MLJBase = "1.6 - 1.7" -MLJDecisionTreeInterface = "0.4" -MLJModelInterface = "1.8" PrettyTables = "2.2" ProgressMeter = "1" Random = "1" From 2828e50296a04ba3544cbbd9450388e5dc5a99d6 Mon Sep 17 00:00:00 2001 From: PasoStudio73 Date: Tue, 15 Apr 2025 01:16:47 +0200 Subject: [PATCH 18/44] again --- Project.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/Project.toml b/Project.toml index 8122531..81745e0 100644 --- a/Project.toml +++ b/Project.toml @@ -76,7 +76,6 @@ MLJ = "add582a8-e3ab-11e8-2d5e-e98b27df1bc7" MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d" MLJDecisionTreeInterface = "c6f25543-311c-4c74-83dc-3ea6d1015661" MLJModelInterface = "e80e1ace-859a-464e-9ed9-23947d8ae3ea" -MLJXGBoostInterface = "54119dfa-1dab-4055-a167-80440f4f7a91" Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a" MultiData = "8cc5100c-b3d1-4f82-90cb-0ea93d317aba" PlutoUI = "7f904dfe-b85e-4ff6-b463-dae2292396a8" From 7c3fbde3d2b338278f88ea767f43d7bb6723c5df Mon Sep 17 00:00:00 2001 From: PasoStudio73 Date: Tue, 15 Apr 2025 01:21:59 +0200 Subject: [PATCH 19/44] guess what? 
--- Project.toml | 2 +- test/XGBoostExt/xgboost_classifier.jl | 1 - test/XGBoostExt/xgboost_predict_issue.jl | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/Project.toml b/Project.toml index 81745e0..fa6d55d 100644 --- a/Project.toml +++ b/Project.toml @@ -84,4 +84,4 @@ SoleData = "123f1ae1-6307-4526-ab5b-aab3a92a2b8c" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Test", "DataFrames", "Random", "MLJ", "MLJXGBoostInterface", "MultiData", "Markdown", "InteractiveUtils", "BenchmarkTools", "MLJBase", "XGBoost", "DecisionTree", "MLJDecisionTreeInterface", "SoleData"] +test = ["Test", "DataFrames", "Random", "MLJ", "MultiData", "Markdown", "InteractiveUtils", "BenchmarkTools", "MLJBase", "XGBoost", "DecisionTree", "MLJDecisionTreeInterface", "SoleData"] diff --git a/test/XGBoostExt/xgboost_classifier.jl b/test/XGBoostExt/xgboost_classifier.jl index f02de30..2c1e76e 100644 --- a/test/XGBoostExt/xgboost_classifier.jl +++ b/test/XGBoostExt/xgboost_classifier.jl @@ -4,7 +4,6 @@ using MLJ using MLJBase using DataFrames -# using MLJXGBoostInterface using SoleModels import MLJModelInterface as MMI diff --git a/test/XGBoostExt/xgboost_predict_issue.jl b/test/XGBoostExt/xgboost_predict_issue.jl index 0a9af41..198093f 100644 --- a/test/XGBoostExt/xgboost_predict_issue.jl +++ b/test/XGBoostExt/xgboost_predict_issue.jl @@ -1,6 +1,6 @@ using MLJ using DataFrames -# using MLJXGBoostInterface + import MLJModelInterface as MMI using SoleModels import XGBoost as XGB From a9acf2ec447430748d827475eaffa454f8006d99 Mon Sep 17 00:00:00 2001 From: PasoStudio73 Date: Tue, 15 Apr 2025 01:33:52 +0200 Subject: [PATCH 20/44] update ci.yml breaking! --- .github/workflows/ci.backup | 21 +++++++++++++++++++++ .github/workflows/ci.yml | 27 +++++++++++++++++++++------ 2 files changed, 42 insertions(+), 6 deletions(-) create mode 100644 .github/workflows/ci.backup diff --git a/.github/workflows/ci.backup b/.github/workflows/ci.backup new file mode 100644 index 0000000..cdf957d --- /dev/null +++ b/.github/workflows/ci.backup @@ -0,0 +1,21 @@ +name: Upload coverage reports to Codecov +on: [push, pull_request] +jobs: + run: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v2 + - name: Set up Julia 1.9.0 + uses: julia-actions/setup-julia@v1 + with: + version: "1.9.0" + - uses: julia-actions/julia-buildpkg@v1 + - uses: julia-actions/julia-runtest@v1 + - uses: julia-actions/julia-processcoverage@v1 + - uses: codecov/codecov-action@v5 + with: + token: ${{ secrets.CODECOV_TOKEN }} + slug: aclai-lab/SoleModels.jl + + diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ba475a2..45b1c86 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -6,15 +6,30 @@ jobs: steps: - name: Checkout uses: actions/checkout@v2 + - name: Set up Julia 1.9.0 uses: julia-actions/setup-julia@v1 with: version: "1.9.0" - - uses: julia-actions/julia-buildpkg@v1 - - uses: julia-actions/julia-runtest@v1 - - uses: julia-actions/julia-processcoverage@v1 - - uses: codecov/codecov-action@v5 + + - name: Build package + uses: julia-actions/julia-buildpkg@v1 + + - name: Build test dependencies + run: | + using Pkg + Pkg.develop(PackageSpec(path=pwd())) + Pkg.instantiate() + shell: julia --project=test {0} + + - name: Run tests + uses: julia-actions/julia-runtest@v1 + + - name: Process coverage + uses: julia-actions/julia-processcoverage@v1 + + - name: Upload coverage + uses: codecov/codecov-action@v5 with: token: ${{ secrets.CODECOV_TOKEN }} - slug: 
aclai-lab/SoleModels.jl - + slug: aclai-lab/SoleModels.jl \ No newline at end of file From 828c4468ae48593ea22419163e94cbcc4780bf0b Mon Sep 17 00:00:00 2001 From: PasoStudio73 Date: Tue, 15 Apr 2025 01:48:30 +0200 Subject: [PATCH 21/44] again --- .github/workflows/ci.yml | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 45b1c86..f836f3d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -6,30 +6,16 @@ jobs: steps: - name: Checkout uses: actions/checkout@v2 - - name: Set up Julia 1.9.0 uses: julia-actions/setup-julia@v1 with: version: "1.9.0" - - - name: Build package - uses: julia-actions/julia-buildpkg@v1 - - - name: Build test dependencies - run: | - using Pkg - Pkg.develop(PackageSpec(path=pwd())) - Pkg.instantiate() - shell: julia --project=test {0} - - - name: Run tests - uses: julia-actions/julia-runtest@v1 - - - name: Process coverage - uses: julia-actions/julia-processcoverage@v1 - - - name: Upload coverage - uses: codecov/codecov-action@v5 + - uses: julia-actions/julia-buildpkg@v1 + - uses: julia-actions/julia-runtest@v1 + with: + project: test + - uses: julia-actions/julia-processcoverage@v1 + - uses: codecov/codecov-action@v5 with: token: ${{ secrets.CODECOV_TOKEN }} slug: aclai-lab/SoleModels.jl \ No newline at end of file From a7ced5b4ca44cb96f753b37bcfc200c2e645cbcd Mon Sep 17 00:00:00 2001 From: PasoStudio73 Date: Tue, 15 Apr 2025 01:53:25 +0200 Subject: [PATCH 22/44] last try --- .github/workflows/ci.yml | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f836f3d..a05082c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -6,16 +6,33 @@ jobs: steps: - name: Checkout uses: actions/checkout@v2 + - name: Set up Julia 1.9.0 uses: julia-actions/setup-julia@v1 with: version: "1.9.0" - - uses: julia-actions/julia-buildpkg@v1 - - uses: julia-actions/julia-runtest@v1 - with: - project: test - - uses: julia-actions/julia-processcoverage@v1 - - uses: codecov/codecov-action@v5 + + - name: Build package + uses: julia-actions/julia-buildpkg@v1 + + - name: Setup test environment + run: | + using Pkg + Pkg.develop(PackageSpec(path=pwd())) + Pkg.instantiate() + shell: julia --project=test {0} + + - name: Run tests + run: | + using Pkg + Pkg.test("SoleModels", coverage=true) + shell: julia --project=test {0} + + - name: Process coverage + uses: julia-actions/julia-processcoverage@v1 + + - name: Upload coverage + uses: codecov/codecov-action@v5 with: token: ${{ secrets.CODECOV_TOKEN }} slug: aclai-lab/SoleModels.jl \ No newline at end of file From fba387a4d7e7db0ddd33d8df582a8b49ed9cd797 Mon Sep 17 00:00:00 2001 From: PasoStudio73 Date: Tue, 15 Apr 2025 12:28:30 +0200 Subject: [PATCH 23/44] reverted ci.yml --- .github/workflows/ci.backup | 21 --------------------- .github/workflows/ci.yml | 27 ++++----------------------- 2 files changed, 4 insertions(+), 44 deletions(-) delete mode 100644 .github/workflows/ci.backup diff --git a/.github/workflows/ci.backup b/.github/workflows/ci.backup deleted file mode 100644 index cdf957d..0000000 --- a/.github/workflows/ci.backup +++ /dev/null @@ -1,21 +0,0 @@ -name: Upload coverage reports to Codecov -on: [push, pull_request] -jobs: - run: - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@v2 - - name: Set up Julia 1.9.0 - uses: julia-actions/setup-julia@v1 - with: - version: "1.9.0" - 
- uses: julia-actions/julia-buildpkg@v1 - - uses: julia-actions/julia-runtest@v1 - - uses: julia-actions/julia-processcoverage@v1 - - uses: codecov/codecov-action@v5 - with: - token: ${{ secrets.CODECOV_TOKEN }} - slug: aclai-lab/SoleModels.jl - - diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a05082c..9cdf0cf 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -6,33 +6,14 @@ jobs: steps: - name: Checkout uses: actions/checkout@v2 - - name: Set up Julia 1.9.0 uses: julia-actions/setup-julia@v1 with: version: "1.9.0" - - - name: Build package - uses: julia-actions/julia-buildpkg@v1 - - - name: Setup test environment - run: | - using Pkg - Pkg.develop(PackageSpec(path=pwd())) - Pkg.instantiate() - shell: julia --project=test {0} - - - name: Run tests - run: | - using Pkg - Pkg.test("SoleModels", coverage=true) - shell: julia --project=test {0} - - - name: Process coverage - uses: julia-actions/julia-processcoverage@v1 - - - name: Upload coverage - uses: codecov/codecov-action@v5 + - uses: julia-actions/julia-buildpkg@v1 + - uses: julia-actions/julia-runtest@v1 + - uses: julia-actions/julia-processcoverage@v1 + - uses: codecov/codecov-action@v5 with: token: ${{ secrets.CODECOV_TOKEN }} slug: aclai-lab/SoleModels.jl \ No newline at end of file From 1e0971f640f533e76b3c7b6a7eb516b7ec36f726 Mon Sep 17 00:00:00 2001 From: PasoStudio73 Date: Tue, 15 Apr 2025 13:28:47 +0200 Subject: [PATCH 24/44] atom getter --- ext/XGBoostExt.jl | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/ext/XGBoostExt.jl b/ext/XGBoostExt.jl index 18ea25b..e25661e 100644 --- a/ext/XGBoostExt.jl +++ b/ext/XGBoostExt.jl @@ -102,10 +102,14 @@ function get_condition(class_idx, featurenames; test_operator, featval) return ScalarCondition(feature, test_operator, featval) end +get_operator(atom::Atom{<:ScalarCondition}) = atom.value.metacond.test_operator +get_i_variable(atom::Atom{<:ScalarCondition}) = atom.value.metacond.feature.i_variable +get_threshold(atom::Atom{<:ScalarCondition}) = atom.value.threshold + function satisfies_conditions(row, formula) - all(atom -> atom.value.metacond.test_operator( - row[atom.value.metacond.feature.i_variable], - atom.value.threshold), formula + all(atom -> get_operator(atom)( + row[get_i_variable(atom)], + get_threshold(atom)), formula ) end From 53161b1624796bdc649ce748276db08227350f74 Mon Sep 17 00:00:00 2001 From: PasoStudio73 Date: Tue, 15 Apr 2025 16:43:49 +0200 Subject: [PATCH 25/44] adaboost test --- ext/XGBoostExt.jl | 3 + test/DecisionTreeExt/adaboost.jl | 144 ++++++++++++++++++++++++++ test/XGBoostExt/xgboost_classifier.jl | 46 ++++---- test/runtests.jl | 2 +- 4 files changed, 173 insertions(+), 22 deletions(-) create mode 100644 test/DecisionTreeExt/adaboost.jl diff --git a/ext/XGBoostExt.jl b/ext/XGBoostExt.jl index e25661e..ff19594 100644 --- a/ext/XGBoostExt.jl +++ b/ext/XGBoostExt.jl @@ -7,6 +7,9 @@ using CategoricalArrays import SoleModels: alphabet, solemodel +# ---------------------------------------------------------------------------- # +# DecisionXGBoost alphabet # +# ---------------------------------------------------------------------------- # function alphabet(model::XGBoost.Booster; kwargs...) # error("TODO fix and test.") function _alphabet!(a::Vector, model::XGBoost.Booster; kwargs...) 
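
For context, the getters introduced in PATCH 24 let satisfies_conditions check a row against a
conjunction of scalar atoms without reaching into nested fields. A minimal, hypothetical sketch of
that access pattern (the `atoms` and `row` values are invented for illustration; `Atom`,
`ScalarCondition`, and `VariableValue` are the same constructors used elsewhere in this series):

    # Does one data row satisfy a conjunction of scalar atoms?
    atoms = [Atom(ScalarCondition(VariableValue(3), <, 2.45f0)),  # hypothetical split
             Atom(ScalarCondition(VariableValue(4), <, 1.75f0))]  # hypothetical split
    row = (5.1f0, 3.5f0, 1.4f0, 0.2f0)  # one instance, indexed by variable
    holds = all(atoms) do atom
        get_operator(atom)(row[get_i_variable(atom)], get_threshold(atom))
    end  # true here: 1.4 < 2.45 and 0.2 < 1.75
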
diff --git a/test/DecisionTreeExt/adaboost.jl b/test/DecisionTreeExt/adaboost.jl new file mode 100644 index 0000000..da3fff7 --- /dev/null +++ b/test/DecisionTreeExt/adaboost.jl @@ -0,0 +1,144 @@ +using Test + +using MLJ +using MLJBase +using DataFrames + +using MLJDecisionTreeInterface +using SoleModels +using Random + +import DecisionTree as DT + +X, y = @load_iris +X = DataFrame(X) + +train_ratio = 0.7 +rng = Xoshiro(11) + +train, test = partition(eachindex(y), train_ratio; shuffle=true, rng) +X_train, y_train = X[train, :], y[train] +X_test, y_test = X[test, :], y[test] + +println("Training set size: ", size(X_train), " - ", size(y_train)) +println("Test set size: ", size(X_test), " - ", size(y_test)) +println("Training set type: ", typeof(X_train), " - ", typeof(y_train)) +println("Test set type: ", typeof(X_test), " - ", typeof(y_test)) + +# ---------------------------------------------------------------------------- # +# AdaBoost solemodel # +# ---------------------------------------------------------------------------- # +Stump = MLJ.@load AdaBoostStumpClassifier pkg=DecisionTree + +model = Stump(; + n_iter=10, + feature_importance=:impurity, + rng +) + +# Bind the model and data into a machine +mach = machine(model, X_train, y_train) +# Fit the model +fit!(mach) + +weights = mach.fitresult[2] +classlabels = sort(mach.fitresult[3]) +featurenames = MLJ.report(mach).features + +solem = solemodel(MLJ.fitted_params(mach).stumps; weights, classlabels, featurenames) +solem = solemodel(MLJ.fitted_params(mach).stumps; weights, classlabels, featurenames, keep_condensed = false) + +@test SoleData.scalarlogiset(X_test; allow_propositional = true) isa PropositionalLogiset + +# Make test instances flow into the model +preds = apply(solem, X_test) +preds2 = apply!(solem, X_test, y_test) + +@test preds == preds2 + +# apply!(solem, X_test, y_test, mode = :append) + +printmodel(solem; max_depth = 7, show_intermediate_finals = true, show_metrics = true) + +# @test_broken printmodel.(listrules(solem, min_lift = 1.0, min_ninstances = 0); show_metrics = true); + +# ---------------------------------------------------------------------------- # +# AdaBoost decisiontree # +# ---------------------------------------------------------------------------- # +# train adaptive-boosted stumps, using 10 iterations +dt_model, dt_coeffs = DT.build_adaboost_stumps(y_train, Matrix(X_train), 10) +# apply learned model +dt_preds = apply_adaboost_stumps(dt_model, dt_coeffs, Matrix(X_test)) +# get the probability of each label +dt_proba = apply_adaboost_stumps_proba(dt_model, dt_coeffs, Matrix(X_test), classlabels) + +@test preds == dt_preds + +# ---------------------------------------------------------------------------- # +# Accuracy # +# ---------------------------------------------------------------------------- # +ada_accuracy = sum(preds .== y_test)/length(y_test) +# @test accuracy >= 0.8 + +# decision tree +Tree = MLJ.@load DecisionTreeClassifier pkg=DecisionTree +dt_model = Tree(max_depth=-1, min_samples_leaf=1, min_samples_split=2) +dt_mach = machine(dt_model, X_train, y_train) +fit!(dt_mach) +dt_solem = solemodel(fitted_params(dt_mach).tree) +dt_preds = apply(dt_solem, X_test) +dt_accuracy = sum(dt_preds .== y_test)/length(y_test) + +# random forest +Forest = MLJ.@load RandomForestClassifier pkg=DecisionTree +rm_model = Forest(max_depth=3, min_samples_leaf=1, min_samples_split=2, n_trees=10) +rm_mach = machine(rm_model, X_train, y_train) +fit!(rm_mach) +classlabels = (rm_mach).fitresult[2] +classlabels = 
classlabels[sortperm((rm_mach).fitresult[3])] +featurenames = report(rm_mach).features +rm_solem = solemodel(fitted_params(rm_mach).forest; classlabels, featurenames) +rm_preds = apply(rm_solem, X_test) +rm_accuracy = sum(rm_preds .== y_test)/length(y_test) + +println("AdaBoost accuracy: ", ada_accuracy) +println("DecisionTree accuracy: ", dt_accuracy) +println("RandomForest accuracy: ", rm_accuracy) + +@test ada_accuracy ≥ rm_accuracy ≥ dt_accuracy + +# ---------------------------------------------------------------------------- # +# Data Validation # +# ---------------------------------------------------------------------------- # +@testset "data validation" begin + Stump = MLJ.@load AdaBoostStumpClassifier pkg=DecisionTree + + for train_ratio in 0.5:0.1:0.9 + for seed in 1:40 + train, test = partition(eachindex(y), train_ratio; shuffle=true, rng=Xoshiro(seed)) + X_train, y_train = X[train, :], y[train] + X_test, y_test = X[test, :], y[test] + + for n_iter in 10:10:100 + # solemodel + model = Stump(; n_iter, rng=Xoshiro(seed)) + mach = machine(model, X_train, y_train) + fit!(mach) + weights = mach.fitresult[2] + classlabels = sort(mach.fitresult[3]) + featurenames = MLJ.report(mach).features + solem = solemodel(MLJ.fitted_params(mach).stumps; weights, classlabels, featurenames) + preds = apply(solem, X_test) + + # decisiontree + yl_train = CategoricalArrays.levelcode.(y_train) + dt_model, dt_coeffs = DT.build_adaboost_stumps(yl_train, Matrix(X_train), n_iter; rng=Xoshiro(seed)) + dt_preds = apply_adaboost_stumps(dt_model, dt_coeffs, Matrix(X_test)) + + code_preds = CategoricalArrays.levelcode.(preds) + @test code_preds == dt_preds + end + end + end +end + diff --git a/test/XGBoostExt/xgboost_classifier.jl b/test/XGBoostExt/xgboost_classifier.jl index 2c1e76e..3c9e7e1 100644 --- a/test/XGBoostExt/xgboost_classifier.jl +++ b/test/XGBoostExt/xgboost_classifier.jl @@ -88,27 +88,31 @@ predsl = CategoricalArrays.levelcode.(categorical(preds)) .- 1 @test_nowarn alphabet(fitted_params(mach).fitresult[1]) -for seed in 1:40 - rng = Xoshiro(seed) - train, test = partition(eachindex(y), train_ratio; shuffle=true, rng) - X_train, y_train = X[train, :], y[train] - X_test, y_test = X[test, :], y[test] - for num_round in 10:10:100 - for eta in 0.1:0.1:0.9 - model = XGTrees(; num_round, eta, objective="multi:softmax") - mach = machine(model, X_train, y_train) - fit!(mach) - trees = XGB.trees(mach.fitresult[1]) - solem = solemodel(trees, Matrix(X_train), y_train; classlabels, featurenames, use_float32=true) - X_test_f32 = mapcols(col -> Float32.(col), X_test) - preds = apply(solem, X_test_f32) - predsl = CategoricalArrays.levelcode.(categorical(preds)) - - yl_train = CategoricalArrays.levelcode.(categorical(y_train)) .- 1 - bst = XGB.xgboost((X_train, yl_train); num_round, eta, num_class=3, objective="multi:softmax") - ŷ = XGB.predict(bst, X_test) - - @test (predsl .-1) == ŷ +# ---------------------------------------------------------------------------- # +# Data Validation # +# ---------------------------------------------------------------------------- # +@testset "data validation" begin + for seed in 1:40 + train, test = partition(eachindex(y), train_ratio; shuffle=true, rng=Xoshiro(seed)) + X_train, y_train = X[train, :], y[train] + X_test, y_test = X[test, :], y[test] + for num_round in 10:10:100 + for eta in 0.1:0.1:0.9 + model = XGTrees(; num_round, eta, objective="multi:softmax") + mach = machine(model, X_train, y_train) + fit!(mach) + trees = XGB.trees(mach.fitresult[1]) + solem = 
solemodel(trees, Matrix(X_train), y_train; classlabels, featurenames, use_float32=true) + X_test_f32 = mapcols(col -> Float32.(col), X_test) + preds = apply(solem, X_test_f32) + predsl = CategoricalArrays.levelcode.(categorical(preds)) + + yl_train = CategoricalArrays.levelcode.(categorical(y_train)) .- 1 + bst = XGB.xgboost((X_train, yl_train); num_round, eta, num_class=3, objective="multi:softmax") + ŷ = XGB.predict(bst, X_test) + + @test (predsl .-1) == ŷ + end end end end diff --git a/test/runtests.jl b/test/runtests.jl index b6d170a..4e779db 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -21,7 +21,7 @@ test_suites = [ ("Rules", ["juliacon2024.jl", ]), ("Linear forms", ["linear-form-utilities.jl", ]), ("Pluto Demo", ["$(dirname(dirname(pathof(SoleModels))))/pluto-demo.jl", ]), - ("DecisionTreeExt", ["DecisionTreeExt/tree.jl", "DecisionTreeExt/forest.jl"]), + ("DecisionTreeExt", ["DecisionTreeExt/tree.jl", "DecisionTreeExt/forest.jl", "DecisionTreeExt/adaboost.jl"]), ("XGBoostExt", ["XGBoostExt/xgboost_classifier.jl"]), ] From d7a5d15d5e7d743ad604b5b2b360d4577c1b445e Mon Sep 17 00:00:00 2001 From: PasoStudio73 Date: Wed, 16 Apr 2025 16:50:51 +0200 Subject: [PATCH 26/44] xgboost classifier ready, tested and working --- ext/XGBoostExt.jl | 18 ++-- test/DecisionTreeExt/adaboost.jl | 19 ++-- test/XGBoostExt/xgboost_classifier.jl | 134 +++++++++++++++++--------- 3 files changed, 107 insertions(+), 64 deletions(-) diff --git a/ext/XGBoostExt.jl b/ext/XGBoostExt.jl index ff19594..fdf9639 100644 --- a/ext/XGBoostExt.jl +++ b/ext/XGBoostExt.jl @@ -122,9 +122,9 @@ end function early_return(leaf, antecedent, clabel, classl) info =(; - leaf_values = leaf, - supporting_predictions = clabel, - supporting_labels = [classl], + leaf_values=leaf, + supporting_predictions=clabel, + supporting_labels=[classl], ) return Branch( @@ -212,10 +212,6 @@ split_condition = use_float32 ? Float32(tree.split_condition) : tree.split_condi else SoleModels.solemodel(tree.children[1], X, y; path_conditions=left_path, classlabels, class_idx, clabels, featurenames, use_float32) end - isnothing(lefttree) && - begin - return early_return(tree.children[1].leaf, antecedent, clabels, classlabels[class_idx]) - end righttree = if isnothing(tree.children[2].split) # @show SoleModels.join_antecedents(right_path) @@ -223,10 +219,6 @@ split_condition = use_float32 ? 
Float32(tree.split_condition) : tree.split_condi
         else
             SoleModels.solemodel(tree.children[2], X, y; path_conditions=right_path, classlabels, class_idx, clabels, featurenames, use_float32)
         end
-    isnothing(righttree) &&
-        begin
-            return early_return(tree.children[2].leaf, antecedent, clabels, classlabels[class_idx])
-        end
 
     info = (;
         leaf_values = [lefttree.info[:leaf_values]..., righttree.info[:leaf_values]...],
@@ -244,7 +236,11 @@ function xgbleaf(
     use_float32::Bool,
 )
     bitX = bitmap_check_conditions(X, formula)
+
+    # this can happen when the split condition doesn't match any class
+    !any(bitX) && (bitX = trues(length(y)))
     prediction = SoleModels.bestguess(y[bitX]; suppress_parity_warning=true)
+    labels = unique(y)
 
     isnothing(prediction) && return nothing
 
diff --git a/test/DecisionTreeExt/adaboost.jl b/test/DecisionTreeExt/adaboost.jl
index da3fff7..1c2b921 100644
--- a/test/DecisionTreeExt/adaboost.jl
+++ b/test/DecisionTreeExt/adaboost.jl
@@ -3,6 +3,7 @@ using Test
 using MLJ
 using MLJBase
 using DataFrames
+using CategoricalArrays
 
 using MLJDecisionTreeInterface
 using SoleModels
@@ -14,7 +15,7 @@ X, y = @load_iris
 X = DataFrame(X)
 
 train_ratio = 0.7
-rng = Xoshiro(11)
+rng = Xoshiro(1)
 
 train, test = partition(eachindex(y), train_ratio; shuffle=true, rng)
 X_train, y_train = X[train, :], y[train]
@@ -39,7 +40,7 @@ model = Stump(;
 # Bind the model and data into a machine
 mach = machine(model, X_train, y_train)
 # Fit the model
-fit!(mach)
+fit!(mach, verbosity=0)
 
 weights = mach.fitresult[2]
 classlabels = sort(mach.fitresult[3])
@@ -68,9 +69,9 @@ printmodel(solem; max_depth = 7, show_intermediate_finals = true, show_metrics =
 # train adaptive-boosted stumps, using 10 iterations
 dt_model, dt_coeffs = DT.build_adaboost_stumps(y_train, Matrix(X_train), 10)
 # apply learned model
-dt_preds = apply_adaboost_stumps(dt_model, dt_coeffs, Matrix(X_test))
+dt_preds = DT.apply_adaboost_stumps(dt_model, dt_coeffs, Matrix(X_test))
 # get the probability of each label
-dt_proba = apply_adaboost_stumps_proba(dt_model, dt_coeffs, Matrix(X_test), classlabels)
+dt_proba = DT.apply_adaboost_stumps_proba(dt_model, dt_coeffs, Matrix(X_test), classlabels)
 
 @test preds == dt_preds
 
@@ -84,16 +85,16 @@ ada_accuracy = sum(preds .== y_test)/length(y_test)
 Tree = MLJ.@load DecisionTreeClassifier pkg=DecisionTree
 dt_model = Tree(max_depth=-1, min_samples_leaf=1, min_samples_split=2)
 dt_mach = machine(dt_model, X_train, y_train)
-fit!(dt_mach)
+fit!(dt_mach, verbosity=0)
 dt_solem = solemodel(fitted_params(dt_mach).tree)
 dt_preds = apply(dt_solem, X_test)
 dt_accuracy = sum(dt_preds .== y_test)/length(y_test)
 
 # random forest
 Forest = MLJ.@load RandomForestClassifier pkg=DecisionTree
-rm_model = Forest(max_depth=3, min_samples_leaf=1, min_samples_split=2, n_trees=10)
+rm_model = Forest(; max_depth=3, min_samples_leaf=1, min_samples_split=2, n_trees=10, rng)
 rm_mach = machine(rm_model, X_train, y_train)
-fit!(rm_mach)
+fit!(rm_mach, verbosity=0)
 classlabels = (rm_mach).fitresult[2]
 classlabels = classlabels[sortperm((rm_mach).fitresult[3])]
 featurenames = report(rm_mach).features
@@ -123,7 +124,7 @@ println("RandomForest accuracy: ", rm_accuracy)
             # solemodel
             model = Stump(; n_iter, rng=Xoshiro(seed))
             mach = machine(model, X_train, y_train)
-            fit!(mach)
+            fit!(mach, verbosity=0)
             weights = mach.fitresult[2]
             classlabels = sort(mach.fitresult[3])
             featurenames = MLJ.report(mach).features
@@ -133,7 +134,7 @@ println("RandomForest accuracy: ", rm_accuracy)
             # decisiontree
             yl_train = CategoricalArrays.levelcode.(y_train)
             dt_model, dt_coeffs 
= DT.build_adaboost_stumps(yl_train, Matrix(X_train), n_iter; rng=Xoshiro(seed)) - dt_preds = apply_adaboost_stumps(dt_model, dt_coeffs, Matrix(X_test)) + dt_preds = DT.apply_adaboost_stumps(dt_model, dt_coeffs, Matrix(X_test)) code_preds = CategoricalArrays.levelcode.(preds) @test code_preds == dt_preds diff --git a/test/XGBoostExt/xgboost_classifier.jl b/test/XGBoostExt/xgboost_classifier.jl index 3c9e7e1..d25b661 100644 --- a/test/XGBoostExt/xgboost_classifier.jl +++ b/test/XGBoostExt/xgboost_classifier.jl @@ -14,7 +14,7 @@ using Random, CategoricalArrays X, y = @load_iris X = DataFrame(X) -train_ratio = 0.8 +train_ratio = 0.7 rng = Xoshiro(11) train, test = partition(eachindex(y), train_ratio; shuffle=true, rng) @@ -26,92 +26,138 @@ println("Test set size: ", size(X_test), " - ", size(y_test)) println("Training set type: ", typeof(X_train), " - ", typeof(y_train)) println("Test set type: ", typeof(X_test), " - ", typeof(y_test)) +# ---------------------------------------------------------------------------- # +# XGBoost solemodel # +# ---------------------------------------------------------------------------- # XGTrees = MLJ.@load XGBoostClassifier pkg=XGBoost model = XGTrees(; - num_round=1, - max_depth=6, + num_round=10, + tree_method="exact", objective="multi:softmax" ) # Bind the model and data into a machine mach = machine(model, X_train, y_train) # Fit the model -fit!(mach) - -trees = XGB.trees(mach.fitresult[1]) +fit!(mach; verbosity=0) get_encoding(classes_seen) = Dict(MMI.int(c) => c for c in MMI.classes(classes_seen)) get_classlabels(encoding) = [string(encoding[i]) for i in sort(keys(encoding) |> collect)] +trees = XGB.trees(mach.fitresult[1]) encoding = get_encoding(mach.fitresult[2]) classlabels = get_classlabels(encoding) featurenames = mach.report.vals[1].features -# ds_safetest = vcat(y_train, "nothing") -# solem = solemodel(trees, Matrix(X_train), y_train) solem = solemodel(trees, Matrix(X_train), y_train; classlabels, featurenames) solem = solemodel(trees, Matrix(X_train), y_train; classlabels, featurenames, keep_condensed = false) @test SoleData.scalarlogiset(X_test; allow_propositional = true) isa PropositionalLogiset # Make test instances flow into the model -preds = apply(solem, X_test) +X_test_f32 = mapcols(col -> Float32.(col), X_test) +preds = apply(solem, X_test_f32) +predsl = CategoricalArrays.levelcode.(categorical(preds)) .- 1 + +# TODO fix in rule-and-branch.jl # preds2 = apply!(solem, X_test, y_test) # @test preds == preds2 -accuracy = sum(preds .== y_test)/length(y_test) -@test accuracy > 0.9 - -# apply!(solem, X_test, y_test, mode = :append) - -solem = @test_throws ErrorException solemodel(trees, Matrix(X_train), y_train; classlabels, keep_condensed = true) -solem = @test_nowarn solemodel(trees, Matrix(X_train), y_train; classlabels, keep_condensed = false) - -printmodel(solem; max_depth = 7, show_intermediate_finals = true, show_metrics = true) - -# comparision with XGBoost.jl +# ---------------------------------------------------------------------------- # +# julia XGBoost # +# ---------------------------------------------------------------------------- # yl_train = CategoricalArrays.levelcode.(categorical(y_train)) .- 1 # create and train a gradient boosted tree model of 5 trees bst = XGB.xgboost( (X_train, yl_train), num_round=10, num_class=3, - max_depth=6, + tree_method="exact", objective="multi:softmax" ) # obtain model predictions -ŷ = XGB.predict(bst, X_test) +xg_preds = XGB.predict(bst, X_test) -predsl = 
CategoricalArrays.levelcode.(categorical(preds)) .- 1 -@test predsl == ŷ +@test predsl == xg_preds +# ---------------------------------------------------------------------------- # +# Accuracy # +# ---------------------------------------------------------------------------- # +xg_accuracy = sum(preds .== y_test)/length(y_test) +# @test accuracy >= 0.8 + +# decision tree +Tree = MLJ.@load DecisionTreeClassifier pkg=DecisionTree +dt_model = Tree(max_depth=-1, min_samples_leaf=1, min_samples_split=2) +dt_mach = machine(dt_model, X_train, y_train) +fit!(dt_mach, verbosity=0) +dt_solem = solemodel(fitted_params(dt_mach).tree) +dt_preds = apply(dt_solem, X_test) +dt_accuracy = sum(dt_preds .== y_test)/length(y_test) + +# random forest +Forest = MLJ.@load RandomForestClassifier pkg=DecisionTree +rm_model = Forest(;max_depth=3, min_samples_leaf=1, min_samples_split=2, n_trees=10, rng) +rm_mach = machine(rm_model, X_train, y_train) +fit!(rm_mach, verbosity=0) +classlabels = (rm_mach).fitresult[2] +classlabels = classlabels[sortperm((rm_mach).fitresult[3])] +featurenames = report(rm_mach).features +rm_solem = solemodel(fitted_params(rm_mach).forest; classlabels, featurenames) +rm_preds = apply(rm_solem, X_test) +rm_accuracy = sum(rm_preds .== y_test)/length(y_test) + +println("XGBoost accuracy: ", xg_accuracy) +println("DecisionTree accuracy: ", dt_accuracy) +println("RandomForest accuracy: ", rm_accuracy) + +@test xg_accuracy ≥ rm_accuracy ≥ dt_accuracy + +# ---------------------------------------------------------------------------- # +# XGBoost Alphabet # +# ---------------------------------------------------------------------------- # @test_nowarn alphabet(fitted_params(mach).fitresult[1]) # ---------------------------------------------------------------------------- # # Data Validation # # ---------------------------------------------------------------------------- # @testset "data validation" begin - for seed in 1:40 - train, test = partition(eachindex(y), train_ratio; shuffle=true, rng=Xoshiro(seed)) - X_train, y_train = X[train, :], y[train] - X_test, y_test = X[test, :], y[test] - for num_round in 10:10:100 - for eta in 0.1:0.1:0.9 - model = XGTrees(; num_round, eta, objective="multi:softmax") - mach = machine(model, X_train, y_train) - fit!(mach) - trees = XGB.trees(mach.fitresult[1]) - solem = solemodel(trees, Matrix(X_train), y_train; classlabels, featurenames, use_float32=true) - X_test_f32 = mapcols(col -> Float32.(col), X_test) - preds = apply(solem, X_test_f32) - predsl = CategoricalArrays.levelcode.(categorical(preds)) - - yl_train = CategoricalArrays.levelcode.(categorical(y_train)) .- 1 - bst = XGB.xgboost((X_train, yl_train); num_round, eta, num_class=3, objective="multi:softmax") - ŷ = XGB.predict(bst, X_test) - - @test (predsl .-1) == ŷ + XGTrees = MLJ.@load XGBoostClassifier pkg=XGBoost + + for train_ratio in 0.5:0.1:0.9 + for seed in 1:40 + train, test = partition(eachindex(y), train_ratio; shuffle=true, rng=Xoshiro(seed)) + X_train, y_train = X[train, :], y[train] + X_test, y_test = X[test, :], y[test] + + for num_round in 10:10:50 + for eta in 0.1:0.1:0.6 + model = XGTrees(; num_round, eta, objective="multi:softmax") + mach = machine(model, X_train, y_train) + fit!(mach, verbosity=0) + trees = XGB.trees(mach.fitresult[1]) + encoding = get_encoding(mach.fitresult[2]) + classlabels = get_classlabels(encoding) + featurenames = mach.report.vals[1].features + solem = solemodel(trees, Matrix(X_train), y_train; classlabels, featurenames) + X_test_f32 = mapcols(col -> 
Float32.(col), X_test) + preds = apply(solem, X_test_f32) + predsl = CategoricalArrays.levelcode.(categorical(preds)) .- 1 + + yl_train = CategoricalArrays.levelcode.(categorical(y_train)) .- 1 + bst = XGB.xgboost((X_train, yl_train); num_round, eta, num_class=3, objective="multi:softmax") + xg_preds = XGB.predict(bst, X_test) + + if predsl != xg_preds + println("train_ratio: ", train_ratio) + println("seed: ", seed) + println("num_round: ", num_round) + println("eta: ", eta) + gino + end + @test predsl == xg_preds + end end end end From c948f27aa57c7e6f954b33427189a6924417d735 Mon Sep 17 00:00:00 2001 From: PasoStudio73 Date: Wed, 16 Apr 2025 23:09:22 +0200 Subject: [PATCH 27/44] xgboost apply! --- .github/workflows/ci.yml | 3 +- Project.toml | 2 +- src/SoleModels.jl | 1 - src/utils/models/ensembles.jl | 44 ++++++++++++--------------- test/XGBoostExt/xgboost_classifier.jl | 7 ++--- 5 files changed, 25 insertions(+), 32 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9cdf0cf..ba475a2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -16,4 +16,5 @@ jobs: - uses: codecov/codecov-action@v5 with: token: ${{ secrets.CODECOV_TOKEN }} - slug: aclai-lab/SoleModels.jl \ No newline at end of file + slug: aclai-lab/SoleModels.jl + diff --git a/Project.toml b/Project.toml index fa6d55d..d5c217b 100644 --- a/Project.toml +++ b/Project.toml @@ -84,4 +84,4 @@ SoleData = "123f1ae1-6307-4526-ab5b-aab3a92a2b8c" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Test", "DataFrames", "Random", "MLJ", "MultiData", "Markdown", "InteractiveUtils", "BenchmarkTools", "MLJBase", "XGBoost", "DecisionTree", "MLJDecisionTreeInterface", "SoleData"] +test = ["Test", "DataFrames", "Random", "MLJ", "MultiData", "Markdown", "InteractiveUtils", "BenchmarkTools", "MLJBase", "XGBoost", "DecisionTree", "MLJDecisionTreeInterface", "SoleData", "CategoricalArrays"] diff --git a/src/SoleModels.jl b/src/SoleModels.jl index a3ae567..74a9782 100644 --- a/src/SoleModels.jl +++ b/src/SoleModels.jl @@ -61,7 +61,6 @@ export height export DecisionEnsemble, models export DecisionForest, trees export DecisionSet, rules, nrules - export DecisionXGBoost export MixedModel diff --git a/src/utils/models/ensembles.jl b/src/utils/models/ensembles.jl index 6738264..261823c 100644 --- a/src/utils/models/ensembles.jl +++ b/src/utils/models/ensembles.jl @@ -507,30 +507,24 @@ function apply( end # TODO parallelize -# function apply!( -# m::DecisionXGBoost, -# d::AbstractInterpretationSet, -# y::AbstractVector; -# mode = :replace, -# leavesonly = false, -# # show_progress = false, # length(ntrees(m)) > 15, -# suppress_parity_warning = false, -# kwargs... -# ) -# # @show y -# y = __apply_pre(m, d, y) -# # _d = SupportedLogiset(d) TODO? -# # @show y -# preds = hcat([apply!(subm, d, y; mode, leavesonly, kwargs...) for subm in models(m)]...) - -# preds = __apply_post(m, preds) - -# preds = [ -# weighted_aggregation(m)(preds[i,:]; suppress_parity_warning, kwargs...) -# for i in 1:size(preds,1) -# ] +function apply!( + m::DecisionXGBoost, + d::AbstractInterpretationSet, + y::AbstractVector; + mode::Symbol=:replace, + leavesonly::Bool=false, + suppress_parity_warning::Bool=true, + kwargs... +) + y = __apply_pre(m, d, y) -# preds = __apply_pre(m, d, preds) -# return __apply!(m, mode, preds, y, leavesonly) -# end + preds = hcat([apply_leaf_scores(subm, d; suppress_parity_warning, kwargs...) for subm in models(m)]...) 
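+    # One column of leaf scores per sub-model. NOTE: `apply_leaf_scores` and
+    # `scored_aggregation` (used below) are assumed to be defined alongside
+    # this ensemble code; they are not shown in this patch.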
+ preds = __apply_post(m, preds) + preds = [ + scored_aggregation(m)(pred, sort(unique(m.info.supporting_labels))) + for pred in eachrow(preds) + ] + preds = __apply_pre(m, d, preds) + return __apply!(m, mode, preds, y, leavesonly) +end diff --git a/test/XGBoostExt/xgboost_classifier.jl b/test/XGBoostExt/xgboost_classifier.jl index d25b661..7df4871 100644 --- a/test/XGBoostExt/xgboost_classifier.jl +++ b/test/XGBoostExt/xgboost_classifier.jl @@ -59,10 +59,9 @@ X_test_f32 = mapcols(col -> Float32.(col), X_test) preds = apply(solem, X_test_f32) predsl = CategoricalArrays.levelcode.(categorical(preds)) .- 1 -# TODO fix in rule-and-branch.jl -# preds2 = apply!(solem, X_test, y_test) - -# @test preds == preds2 +apply!(solem, X_test, y_test) +@test solem.info.supporting_predictions == preds +@test solem.info.supporting_labels == y_test # ---------------------------------------------------------------------------- # # julia XGBoost # From ef113800b632c06315c24e5624f2efb640bf77e0 Mon Sep 17 00:00:00 2001 From: Perro2110 Date: Fri, 18 Apr 2025 13:21:33 +0200 Subject: [PATCH 28/44] extractrules --> modalextractrules --- src/SoleModels.jl | 5 ++++- src/deprecate.jl | 2 +- src/rule-extraction.jl | 16 ++++++++-------- src/utils/models/other.jl | 2 +- 4 files changed, 14 insertions(+), 11 deletions(-) diff --git a/src/SoleModels.jl b/src/SoleModels.jl index 0fbd307..4b0164e 100644 --- a/src/SoleModels.jl +++ b/src/SoleModels.jl @@ -93,7 +93,10 @@ export subtreeheight include("symbolic-utils.jl") export PlainRuleExtractor -export modalextractrules, listrules, joinrules + + + +export extractrules, listrules, joinrules include("rule-extraction.jl") diff --git a/src/deprecate.jl b/src/deprecate.jl index a1f16d8..3f944f1 100644 --- a/src/deprecate.jl +++ b/src/deprecate.jl @@ -3,7 +3,7 @@ const MixedSymbolicModel = MixedModel const List = DecisionList const Tree = DecisionTree const Forest = DecisionForest - +const modalextractrules = extractrules; export modalextractrules @inline function apply( diff --git a/src/rule-extraction.jl b/src/rule-extraction.jl index d94fb05..50a86c0 100644 --- a/src/rule-extraction.jl +++ b/src/rule-extraction.jl @@ -6,7 +6,7 @@ An exact or heuristic logical method for extracting logical rule from symbolic m Refer to [SolePostHoc](https://github.com/aclai-lab/SolePostHoc.jl) for rule extraction methods. -See also [`modalextractrules`](@ref), [`Rule`](@ref)], [`issymbolicmodel`](@ref). +See also [`extractrules`](@ref), [`Rule`](@ref)], [`issymbolicmodel`](@ref). """ abstract type RuleExtractor end @@ -16,33 +16,33 @@ Return whether a rule extraction method is known to be exact (as opposed to heur isexact(::RuleExtractor) = false """ - modalextractrules(re::RuleExtractor, m, args...; kwargs...) + extractrules(re::RuleExtractor, m, args...; kwargs...) Extract rules from symbolic model `m`, using a rule extraction method `re`. """ -function modalextractrules(re::RuleExtractor, m, args...; kwargs...) - return error("Please, provide method modalextractrules(::$(typeof(m)), args...; kwargs...).") +function extractrules(re::RuleExtractor, m, args...; kwargs...) + return error("Please, provide method extractrules(::$(typeof(m)), args...; kwargs...).") end # Helpers function (RE::Type{<:RuleExtractor})(args...; kwargs...) - return modalextractrules(RE(), args...; kwargs...) + return extractrules(RE(), args...; kwargs...) end # Helpers function (re::RuleExtractor)(args...; kwargs...) - return modalextractrules(re, args...; kwargs...) + return extractrules(re, args...; kwargs...) 
end """ Plain extraction method involves listing one rule per each possible symbolic path within the model. -With this method, [`modalextractrules`](@ref) redirects to [`listrules`](@ref). +With this method, [`extractrules`](@ref) redirects to [`listrules`](@ref). See also [`listrules`](@ref), [`Rule`](@ref)], [`issymbolicmodel`](@ref). """ struct PlainRuleExtractor <: RuleExtractor end isexact(::PlainRuleExtractor) = true -function modalextractrules(::PlainRuleExtractor, m, args...; kwargs...) +function extractrules(::PlainRuleExtractor, m, args...; kwargs...) if haslistrules(m) listrules(m, args...; kwargs...) else diff --git a/src/utils/models/other.jl b/src/utils/models/other.jl index bb9280b..60f79f6 100644 --- a/src/utils/models/other.jl +++ b/src/utils/models/other.jl @@ -361,7 +361,7 @@ iscomplete(m::DecisionSet) = m.iscomplete isnonoverlapping(m::DecisionSet) = m.isnonoverlapping function listrules(m::DecisionSet) - isnonoverlapping || error("Cannot listrules from non-overlapping decision set. Try `modalextractrules` with heuristics, instead.") + isnonoverlapping || error("Cannot listrules from non-overlapping decision set. Try `extractrules` with heuristics, instead.") rules(m) end From d26e1980d97b8e6c13cd11e3e7574f3bc4bad6d4 Mon Sep 17 00:00:00 2001 From: Perro2110 Date: Fri, 18 Apr 2025 18:26:40 +0200 Subject: [PATCH 29/44] minor add in TODO --- TODO.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/TODO.md b/TODO.md index e6e2383..04cfba6 100644 --- a/TODO.md +++ b/TODO.md @@ -51,6 +51,6 @@ Test: # Distribution of covered examples for consequent # Distribution of examples on which the rule was built ✔ Testing parser error @done(24-05-31 11:12) - + ☐ Add test for rule-extraction.jl Questions: - ✔ Readmetrics for CN2 statistics @done(24-05-31 11:12) \ No newline at end of file + ✔ Readmetrics for CN2 statistics @done(24-05-31 11:12) From 9638489e267cec7df5513ff07dcb40196fe55a24 Mon Sep 17 00:00:00 2001 From: PasoStudio73 Date: Sat, 19 Apr 2025 00:28:33 +0200 Subject: [PATCH 30/44] fix tests & deps --- Project.toml | 2 +- test/DecisionTreeExt/forest.jl | 32 +++++++++++++++++ test/DecisionTreeExt/tree.jl | 49 +++++++++++++++++++++++++++ test/XGBoostExt/xgboost_classifier.jl | 10 ++---- 4 files changed, 84 insertions(+), 9 deletions(-) diff --git a/Project.toml b/Project.toml index d5c217b..a9779fa 100644 --- a/Project.toml +++ b/Project.toml @@ -84,4 +84,4 @@ SoleData = "123f1ae1-6307-4526-ab5b-aab3a92a2b8c" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Test", "DataFrames", "Random", "MLJ", "MultiData", "Markdown", "InteractiveUtils", "BenchmarkTools", "MLJBase", "XGBoost", "DecisionTree", "MLJDecisionTreeInterface", "SoleData", "CategoricalArrays"] +test = ["Test", "DataFrames", "Random", "MLJ", "MultiData", "Markdown", "InteractiveUtils", "BenchmarkTools", "MLJBase", "XGBoost", "DecisionTree", "MLJModelInterface", "MLJDecisionTreeInterface", "SoleData"] diff --git a/test/DecisionTreeExt/forest.jl b/test/DecisionTreeExt/forest.jl index f82afd2..fa7dccc 100644 --- a/test/DecisionTreeExt/forest.jl +++ b/test/DecisionTreeExt/forest.jl @@ -62,3 +62,35 @@ accuracy = sum(preds .== y_test)/length(y_test) printmodel(solem; max_depth = 7, show_intermediate_finals = true, show_metrics = true) # @test_broken printmodel.(listrules(solem, min_lift = 1.0, min_ninstances = 0); show_metrics = true); + +# ---------------------------------------------------------------------------- # +# Data Validation # +# 
---------------------------------------------------------------------------- # +@testset "data validation" begin + Forest = MLJ.@load RandomForestClassifier pkg=DecisionTree + + for train_ratio in 0.5:0.1:0.9 + for seed in 1:40 + train, test = partition(eachindex(y), train_ratio; shuffle=true, rng=Xoshiro(seed)) + X_train, y_train = X[train, :], y[train] + X_test, y_test = X[test, :], y[test] + + for n_trees in 10:10:60 + # solemodel + model = Forest(; n_trees, rng=Xoshiro(seed)) + mach = machine(model, X_train, y_train) + fit!(mach, verbosity=0) + classlabels = (mach).fitresult[2][sortperm((mach).fitresult[3])] + featurenames = MLJ.report(mach).features + solem = solemodel(MLJ.fitted_params(mach).forest; classlabels, featurenames) + preds = apply!(solem, X_test, y_test) + + # decisiontree + rf_model = DT.build_forest(y_train, Matrix(X_train), -1, n_trees; rng=Xoshiro(seed)) + rf_preds = DT.apply_forest(rf_model, Matrix(X_test)) + + @test preds == rf_preds + end + end + end +end diff --git a/test/DecisionTreeExt/tree.jl b/test/DecisionTreeExt/tree.jl index a0c349a..c7c1311 100644 --- a/test/DecisionTreeExt/tree.jl +++ b/test/DecisionTreeExt/tree.jl @@ -7,6 +7,7 @@ using DataFrames using MLJDecisionTreeInterface using SoleModels using Random +using CategoricalArrays import DecisionTree as DT @@ -75,3 +76,51 @@ printmodel.(sort(interesting_rules, by = readmetrics); show_metrics = (; round_d @test length(joinrules(interesting_rules)) == 3 @test (natoms.((interesting_rules)) |> sum) == (natoms.(joinrules(interesting_rules)) |> sum) + +# ---------------------------------------------------------------------------- # +# Data Validation # +# ---------------------------------------------------------------------------- # +@testset "data validation" begin + Tree = MLJ.@load DecisionTreeClassifier pkg=DecisionTree + + for train_ratio in 0.5:0.1:0.9 + for seed in 1:40 + train, test = partition(eachindex(y), train_ratio; shuffle=true, rng=Xoshiro(seed)) + X_train, y_train = X[train, :], y[train] + X_test, y_test = X[test, :], y[test] + + for max_depth in 2:1:6 + # solemodel + model = Tree(; max_depth, rng=Xoshiro(seed)) + mach = machine(model, X_train, y_train) + fit!(mach, verbosity=0) + solem = solemodel(MLJ.fitted_params(mach).tree) + preds = apply!(solem, X_test, y_test) + + # decisiontree + dt_model = DT.build_tree(y_train, Matrix(X_train), 0, max_depth; rng=Xoshiro(seed)) + dt_preds = DT.apply_tree(dt_model, Matrix(X_test)) + + @test preds == dt_preds + end + end + end +end + +### the problem rises in fit! 
in MLJDecisionTreeInterface
+Tree = MLJ.@load DecisionTreeClassifier pkg=DecisionTree
+seed = 1
+max_depth = 3
+train_ratio = 0.5
+train, test = partition(eachindex(y), train_ratio; shuffle=true, rng=Xoshiro(seed))
+X_train, y_train = X[train, :], y[train]
+X_test, y_test = X[test, :], y[test]
+
+model = Tree(; max_depth, rng=Xoshiro(seed))
+mach = machine(model, X_train, y_train)
+fit!(mach, verbosity=0)
+solem = solemodel(MLJ.fitted_params(mach).tree)
+preds = apply!(solem, X_test, y_test)
+
+dt_model = DT.build_tree(y_train, Matrix(X_train), 0, max_depth; rng=Xoshiro(seed))
+dt_preds = DT.apply_tree(dt_model, Matrix(X_test))
\ No newline at end of file
diff --git a/test/XGBoostExt/xgboost_classifier.jl b/test/XGBoostExt/xgboost_classifier.jl
index 7df4871..a885861 100644
--- a/test/XGBoostExt/xgboost_classifier.jl
+++ b/test/XGBoostExt/xgboost_classifier.jl
@@ -1,6 +1,7 @@
 using Test
 
 using MLJ
+using MLJ.CategoricalArrays: levelcode, categorical
 using MLJBase
 using DataFrames
 
@@ -141,20 +142,13 @@ println("RandomForest accuracy: ", rm_accuracy)
             featurenames = mach.report.vals[1].features
             solem = solemodel(trees, Matrix(X_train), y_train; classlabels, featurenames)
             X_test_f32 = mapcols(col -> Float32.(col), X_test)
-            preds = apply(solem, X_test_f32)
+            preds = apply!(solem, X_test_f32, y_test)
             predsl = CategoricalArrays.levelcode.(categorical(preds)) .- 1
 
             yl_train = CategoricalArrays.levelcode.(categorical(y_train)) .- 1
             bst = XGB.xgboost((X_train, yl_train); num_round, eta, num_class=3, objective="multi:softmax")
             xg_preds = XGB.predict(bst, X_test)
 
-            if predsl != xg_preds
-                println("train_ratio: ", train_ratio)
-                println("seed: ", seed)
-                println("num_round: ", num_round)
-                println("eta: ", eta)
-                gino
-            end
             @test predsl == xg_preds
         end
     end

From b2a48698b59819e8492fdb569dad9e2aa1ebcc07 Mon Sep 17 00:00:00 2001
From: PasoStudio73
Date: Wed, 23 Apr 2025 17:28:00 +0200
Subject: [PATCH 31/44] posthoc

---
 src/evaluate.jl | 29 ++++++++++++++++++++++++++---
 1 file changed, 26 insertions(+), 3 deletions(-)

diff --git a/src/evaluate.jl b/src/evaluate.jl
index a460498..cc459ef 100644
--- a/src/evaluate.jl
+++ b/src/evaluate.jl
@@ -322,8 +322,20 @@ function evaluaterule(
     classmask = (Y .== outcome(consequent(rule)))
     checkmask, explanations = begin
         if compute_explanations
+
+            ### from Perry's SoleModels fix for SolePostHoc
             # Note: This is kind of quick and dirty.
-            disjs = SoleLogics.disjuncts(SoleLogics.LeftmostDisjunctiveForm(antecedent(rule)))
+            # disjs = SoleLogics.disjuncts(SoleLogics.LeftmostDisjunctiveForm(antecedent(rule)))
+            ante = antecedent(rule)
+            if (ante isa SyntaxBranch)
+                # Disjunctive root: convert to disjunctive form, then extract the disjuncts
+                dnf = SoleLogics.LeftmostDisjunctiveForm(ante)
+                disjs = SoleLogics.disjuncts(dnf)
+            else
+                # No disjunction at the root → a single disjunct
+                disjs = [ante]
+            end
+
             checkmatrix = hcat([check(disj, X; kwargs...) for disj in disjs]...)
             # @show checkmatrix
             checkmask = map(any, eachrow(checkmatrix))
@@ -337,11 +349,22 @@
     end
     pos_checkmask = checkmask[classmask]
     neg_checkmask = checkmask[(!).(classmask)]
+
+    ### from Perry's SoleModels fix for SolePostHoc
+    # Guard against empty masks (avoids division by zero)
+    sensitivity = length(pos_checkmask) > 0 ? sum(pos_checkmask)/length(pos_checkmask) : 0.0
+    specificity = length(neg_checkmask) > 0 ? 
1-(sum(neg_checkmask)/length(neg_checkmask)) : 1.0 + out = (; classmask = classmask, checkmask = checkmask, - sensitivity = sum(pos_checkmask)/length(pos_checkmask), - specificity = 1-(sum(neg_checkmask)/length(neg_checkmask)), + + ### from Perry's SoleModels fix for SolePostHoc + # sensitivity = sum(pos_checkmask)/length(pos_checkmask), + # specificity = 1-(sum(neg_checkmask)/length(neg_checkmask)), + sensitivity = sensitivity, + specificity = specificity, + explanations = explanations, ) return out From 1b5a6c633e93156d018dcd30bdc2c2c7ee4775c2 Mon Sep 17 00:00:00 2001 From: PasoStudio73 Date: Sun, 27 Apr 2025 13:07:48 +0200 Subject: [PATCH 32/44] test fixed --- test/DecisionTreeExt/tree.jl | 68 ++++++++++++++---------------------- 1 file changed, 26 insertions(+), 42 deletions(-) diff --git a/test/DecisionTreeExt/tree.jl b/test/DecisionTreeExt/tree.jl index c7c1311..909eb31 100644 --- a/test/DecisionTreeExt/tree.jl +++ b/test/DecisionTreeExt/tree.jl @@ -81,46 +81,30 @@ printmodel.(sort(interesting_rules, by = readmetrics); show_metrics = (; round_d # Data Validation # # ---------------------------------------------------------------------------- # @testset "data validation" begin - Tree = MLJ.@load DecisionTreeClassifier pkg=DecisionTree - - for train_ratio in 0.5:0.1:0.9 - for seed in 1:40 - train, test = partition(eachindex(y), train_ratio; shuffle=true, rng=Xoshiro(seed)) - X_train, y_train = X[train, :], y[train] - X_test, y_test = X[test, :], y[test] - - for max_depth in 2:1:6 - # solemodel - model = Tree(; max_depth, rng=Xoshiro(seed)) - mach = machine(model, X_train, y_train) - fit!(mach, verbosity=0) - solem = solemodel(MLJ.fitted_params(mach).tree) - preds = apply!(solem, X_test, y_test) - - # decisiontree - dt_model = DT.build_tree(y_train, Matrix(X_train), 0, max_depth; rng=Xoshiro(seed)) - dt_preds = DT.apply_tree(dt_model, Matrix(X_test)) - - @test preds == dt_preds - end - end - end + Tree = MLJ.@load DecisionTreeClassifier pkg=DecisionTree + + for train_ratio in 0.5:0.1:0.9 + for seed in 1:40 + train, test = partition(eachindex(y), train_ratio; shuffle=true, rng=Xoshiro(seed)) + X_train, y_train = X[train, :], y[train] + X_test, y_test = X[test, :], y[test] + + for max_depth in 2:1:6 + # solemodel + model = Tree(; max_depth, rng=Xoshiro(seed)) + mach = machine(model, X_train, y_train) + fit!(mach, verbosity=0) + solem = solemodel(MLJ.fitted_params(mach).tree) + preds = apply!(solem, X_test, y_test) + + # decisiontree + y_coded_train = @. CategoricalArrays.levelcode(y_train) + dt_model = DT.build_tree(y_coded_train, Matrix(X_train), 0, max_depth; rng=Xoshiro(seed)) + dt_preds = DT.apply_tree(dt_model, Matrix(X_test)) + + preds_coded = CategoricalArrays.levelcode.(CategoricalArray(preds)) + @test preds_coded == dt_preds + end + end + end end - -### the problem rises in fit! 
in MLJDecisionTreeInterface -Tree = MLJ.@load DecisionTreeClassifier pkg=DecisionTree -seed = 1 -max_depth = 3 -train_ratio = 0.5 -train, test = partition(eachindex(y), train_ratio; shuffle=true, rng=Xoshiro(seed)) -X_train, y_train = X[train, :], y[train] -X_test, y_test = X[test, :], y[test] - -model = Tree(; max_depth, rng=Xoshiro(seed)) -mach = machine(model, X_train, y_train) -fit!(mach, verbosity=0) -solem = solemodel(MLJ.fitted_params(mach).tree) -preds = apply!(solem, X_test, y_test) - -dt_model = DT.build_tree(y_train, Matrix(X_train), 0, max_depth; rng=Xoshiro(seed)) -dt_preds = DT.apply_tree(dt_model, Matrix(X_test)) \ No newline at end of file From dbe8e83501c744233316957e44e67a844fdeba1a Mon Sep 17 00:00:00 2001 From: PasoStudio73 Date: Sun, 27 Apr 2025 17:59:19 +0200 Subject: [PATCH 33/44] fix extras deps --- Project.toml | 18 +- ext/MLJXGBoostInterfaceExt.jl | 308 ----------------------- test/XGBoostExt/xgboost_classifier.jl | 9 +- test/XGBoostExt/xgboost_predict_issue.jl | 156 ------------ 4 files changed, 12 insertions(+), 479 deletions(-) delete mode 100644 ext/MLJXGBoostInterfaceExt.jl delete mode 100644 test/XGBoostExt/xgboost_predict_issue.jl diff --git a/Project.toml b/Project.toml index 98fba71..999ae09 100644 --- a/Project.toml +++ b/Project.toml @@ -1,12 +1,7 @@ name = "SoleModels" uuid = "4249d9c7-3290-4ddd-961c-e1d3ec2467f8" license = "MIT" -authors = [ - "Michele GHIOTTI", - "Giovanni PAGLIARINI", - "Edoardo PONSANESI", - "Eduard I. STAN", -] +authors = ["Michele GHIOTTI", "Giovanni PAGLIARINI", "Edoardo PONSANESI", "Eduard I. STAN"] version = "0.10.0" [deps] @@ -45,7 +40,6 @@ XGBoostExt = "XGBoost" [compat] AbstractTrees = "0.4" -BenchmarkTools = "1" CSV = "0.10" CategoricalArrays = "0.10" DataFrames = "1" @@ -74,8 +68,10 @@ ZipFile = "0.10" julia = "1" [extras] -BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" +CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +DecisionTree = "7806a523-6efd-50cb-b5f6-3fa6f1930dbb" +FunctionWrappers = "069b7b12-0de2-55c6-9aab-29f3d0a68a2e" InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" MLJ = "add582a8-e3ab-11e8-2d5e-e98b27df1bc7" MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d" @@ -83,10 +79,12 @@ MLJDecisionTreeInterface = "c6f25543-311c-4c74-83dc-3ea6d1015661" MLJModelInterface = "e80e1ace-859a-464e-9ed9-23947d8ae3ea" Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a" MultiData = "8cc5100c-b3d1-4f82-90cb-0ea93d317aba" -PlutoUI = "7f904dfe-b85e-4ff6-b463-dae2292396a8" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" SoleData = "123f1ae1-6307-4526-ab5b-aab3a92a2b8c" +SoleLogics = "b002da8f-3cb3-4d91-bbe3-2953433912b5" +SoleModels = "4249d9c7-3290-4ddd-961c-e1d3ec2467f8" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +XGBoost = "009559a3-9522-5dbb-924b-0b6ed2b22bb9" [targets] -test = ["Test", "DataFrames", "Random", "MLJ", "MultiData", "Markdown", "InteractiveUtils", "BenchmarkTools", "MLJBase", "XGBoost", "DecisionTree", "MLJModelInterface", "MLJDecisionTreeInterface", "SoleData"] +test = ["Test", "CategoricalArrays", "DataFrames", "Random", "MLJ", "MultiData", "Markdown", "InteractiveUtils", "MLJBase", "XGBoost", "DecisionTree", "MLJModelInterface", "MLJDecisionTreeInterface", "SoleData", "SoleModels", "SoleLogics", "FunctionWrappers"] diff --git a/ext/MLJXGBoostInterfaceExt.jl b/ext/MLJXGBoostInterfaceExt.jl deleted file mode 100644 index caf64fe..0000000 --- a/ext/MLJXGBoostInterfaceExt.jl +++ /dev/null @@ -1,308 +0,0 @@ -module 
MLJXGBoostInterfaceExt - -import MLJModelInterface as MMI -import XGBoost as XGB -import Tables -using CategoricalArrays -using AbstractTrees - -import Sole: AbstractModel -import Sole: VariableValue, ScalarCondition, Atom, ConstantModel, Branch, DecisionTree - -const PKG = "MLJXGBoostInterface" - -abstract type XGBoostAbstractRegressor <: MMI.Deterministic end -abstract type XGBoostAbstractClassifier <: MMI.Probabilistic end - -const XGTypes = Union{XGBoostAbstractRegressor,XGBoostAbstractClassifier} - -struct TreePrinter{T} - tree::T - features::Vector{Symbol} -end -(c::TreePrinter)(depth) = AbstractTrees.print_tree(c.tree, depth, feature_names = c.features) -(c::TreePrinter)() = AbstractTrees.print_tree(c.tree, 5, feature_names = c.features) - -Base.show(stream::IO, c::TreePrinter) = - print(stream, "TreePrinter object (call with display depth)") - -function classes(y) - p = CategoricalArrays.pool(y) - [p[i] for i in 1:length(p)] -end - -# function modelexpr(name::Symbol, absname::Symbol, obj::AbstractString, objvalidate::Symbol) -function modelexpr(name::Symbol, absname::Symbol) - metric = absname == :XGBoostAbstractClassifier ? "mlogloss" : "rmse" - quote - MMI.@mlj_model mutable struct $name <: $absname - # MMI.@mlj_model mutable struct $name - # ref: https://xgboost.readthedocs.io/en/stable/parameter.html - # general parameters - booster::String = "gbtree" - # device::String = "cpu" - eval_metric::String = $metric - objective::Union{String, Nothing} = nothing - num_round::Int = 100::(_ ≥ 0) - early_stopping_rounds::Int = 0::(_ ≥ 0) - - - # parameters for tree booster - eta::Float64 = 0.3::(0.0 ≤ _ ≤ 1.0) - alpha::Float64 = 0::(_ ≥ 0) - gamma::Float64 = 0::(_ ≥ 0) - lambda::Float64 = 1::(_ ≥ 0) - - max_depth::Int = 6::(_ ≥ 0) - min_child_weight::Float64 = 1::(_ ≥ 0) - max_delta_step::Float64 = 0::(_ ≥ 0) - subsample::Float64 = 1::(0 < _ ≤ 1) - sampling_method::String = "uniform" - - colsample_bynode::Float64 = 1::(0 < _ ≤ 1) - colsample_bylevel::Float64 = 1::(0 < _ ≤ 1) - colsample_bytree::Float64 = 1::(0 < _ ≤ 1) - - tree_method::String = "auto" - - # scale_pos_weight::Float64 = 1.0 - end - - - # # additional parameters for dart booster - # one_drop::Union{Int,Bool} = 0::(0 ≤ _ ≤ 1) - # normalize_type::String = "tree" - # rate_drop::Float64 = 0::(0 ≤ _ ≤ 1) - # sample_type::String = "uniform" - # skip_drop::Float64 = 0::(0 ≤ _ ≤ 1) - - # # additional parameters for linear booster - # feature_selector::String = "cyclic" - # top_k::Int = 0::(_ ≥ 0) - - # # additional parameters for tweedie regression - # tweedie_variance_power::Float64 = 1.5::(1 < _ < 2) - - # # additional parameters for pseudo-huber - # # quantile_alpha TODO - - # # additional parameters for quantile loss - # # quantile_alpha TODO - - # # learning task parameters - # base_score::Float64 = 0.5 - - - # # test::Int = 1::(_ ≥ 0) - # # sketch_eps::Float64 = 0.03::(0 < _ < 1) - # # predictor::String = "cpu_predictor" - # # watchlist = nothing # if this is nothing we will not pass it so as to use default - # # importance_type::String = "gain" - # end - end -end - -# eval(modelexpr(:XGBoostClassifier, :XGBoostAbstractClassifier, "automatic", :validate_class_objective)) -# eval(modelexpr(:XGBoostCount, :XGBoostAbstractRegressor, "count:poisson", :validate_count_objective)) -# eval(modelexpr(:XGBoostRegressor, :XGBoostAbstractRegressor, "reg:squarederror", :validate_reg_objective)) - -eval(modelexpr(:XGBoostClassifier, :XGBoostAbstractClassifier)) -eval(modelexpr(:XGBoostCount, :XGBoostAbstractRegressor)) 
-eval(modelexpr(:XGBoostRegressor, :XGBoostAbstractRegressor))
-
-MMI.reports_feature_importances(::Type{<:XGBoostAbstractRegressor}) = true
-MMI.reports_feature_importances(::Type{<:XGBoostAbstractClassifier}) = true
-
-export XGBoostClassifier, XGBoostCount, XGBoostRegressor
-
-function MMI.fit(
-    m::XGBoostClassifier,
-    verbosity::Int,
-    X,
-    y,
-    features,
-    classes,
-)
-
-    integers_seen = unique(y)
-    classes_seen = MMI.decoder(classes)(integers_seen)
-
-    # dX = if isnothing(weight)
-    #     XGB.DMatrix(X, y_code; feature_names=names(X))
-    #     # XGB.DMatrix(MMI.matrix(X), y_code)
-    # else
-    #     XGB.DMatrix(X, y_code; feature_names=names(X), weight = weight)
-    #     # XGB.DMatrix(MMI.matrix(X), y_code; feature_names=names(X), weight = weight)
-    # end
-
-    # bst = xgboost(dm; kwargs(model, verbosity, objective)..., num_class...)
-    nclass = length(classes_seen)
-    if isnothing(m.objective)
-        m.objective = nclass == 2 ? "binary:logistic" : "multi:softprob"
-    end
-
-    params = Dict((field, getfield(m, field)) for field in fieldnames(typeof(m)))
-    bst = XGB.xgboost((X, y.-1); verbosity=verbosity, params..., num_class=nclass)
-
-    # imp = XGB.importancetable(bst)
-    ts = XGB.trees(bst)
-
-    verbosity < 2 || AbstractTrees.print_tree(ts, m.max_depth)
-
-    fitresult = (bst, classes_seen, integers_seen, features)
-
-    cache = nothing
-    report = (
-        classes_seen=nclass,
-        print_tree=TreePrinter(ts, features),
-        features=features,
-    )
-    return fitresult, cache, report
-end
-
-get_encoding(classes_seen) = Dict(MMI.int(c) => c for c in classes(classes_seen))
-classlabels(encoding) = [string(encoding[i]) for i in sort(keys(encoding) |> collect)]
-
-struct InfoXGBNode
-    node::XGB.Node
-    info::NamedTuple
-end
-AbstractTrees.nodevalue(n::InfoXGBNode) = n.node
-
-struct InfoXGBLeaf
-    node::XGB.Node
-    info::NamedTuple
-end
-AbstractTrees.nodevalue(l::InfoXGBLeaf) = l.node
-
-# struct InfoNode{S,T} <: AbstractTrees.AbstractNode{DecisionTree.Node{S,T}}
-#     node::DecisionTree.Node{S,T}
-#     info::NamedTuple
-# end
-# AbstractTrees.nodevalue(n::InfoNode) = n.node
-
-# struct InfoLeaf{T} <: AbstractTrees.AbstractNode{DecisionTree.Leaf{T}}
-#     leaf::DecisionTree.Leaf{T}
-#     info::NamedTuple
-# end
-# AbstractTrees.nodevalue(l::InfoLeaf) = l.leaf
-
-isleaf(node::XGB.Node) = isempty(node.children) ? true : false
-
-wrap(vecnode::Vector{<:XGB.Node}, info::NamedTuple=NamedTuple()) = MLJXGBoostInterface.wrap.(vecnode, Ref(info))
-# wrap(tree::DecisionTree.Root, info::NamedTuple=NamedTuple()) = wrap(tree.node, info)
-wrap(node::XGB.Node, info::NamedTuple=NamedTuple()) = isleaf(node) ? InfoXGBLeaf(node, info) : InfoXGBNode(node, info)
-# wrap(leaf::DecisionTree.Leaf, info::NamedTuple=NamedTuple()) = InfoLeaf(leaf, info)
-
-function MMI.fitted_params(::XGBoostAbstractClassifier, fitresult)
-    raw_tree = XGB.trees(fitresult[1])
-    encoding = get_encoding(fitresult[2])
-    features = fitresult[4]
-    classlabels = MLJXGBoostInterface.classlabels(encoding)
-    info = (featurenames=features, classlabels)
-    tree = MLJXGBoostInterface.wrap(raw_tree, info,)
-    (; tree, raw_tree, encoding, features)
-end
-
-function AbstractTrees.children(node::InfoXGBNode)
-    (wrap(node.children[1], node.info), wrap(node.children[2], node.info))
-end
-AbstractTrees.children(node::InfoXGBLeaf) = ()
-
-# to get column names based on table access type:
-_columnnames(X) = _columnnames(X, Val(Tables.columnaccess(X))) |> collect
-_columnnames(X, ::Val{true}) = Tables.columnnames(Tables.columns(X))
-_columnnames(X, ::Val{false}) = Tables.columnnames(first(Tables.rows(X)))
-
-MMI.reformat(::XGBoostAbstractClassifier, X, y) =
-    (XGB.DMatrix(X), MMI.int(y), _columnnames(X), classes(y))
-# MMI.reformat(::Regressor, X, y) =
-#     (Tables.matrix(X), float(y), _columnnames(X))
-# MMI.selectrows(::TreeModel, I, Xmatrix, y, meta...) =
-#     (view(Xmatrix, I, :), view(y, I), meta...)
-
-split2id(str::String) = parse(Int, filter(isdigit, str)) + 1
-
-function solemodel(
-    tree::Vector{<:InfoXGBNode},
-    raw_tree::Vector{<:XGB.Node},
-    encoding::Dict,
-    features::Vector{Symbol};
-    kwargs...
-)
-    dt = DecisionTree[]
-    @show encoding
-    for (i, t) in enumerate(tree)
-        idx = (i - 1) % length(encoding) + 1
-        push!(dt, MLJXGBoostInterface.solemodel(t; majority=encoding[idx], kwargs...))
-    end
-
-    return dt
-end
-function solemodel(tree::InfoXGBNode, keep_condensed = false; majority, use_featurenames = true, kwargs...)
-    # @show fieldnames(typeof(tree))
-    use_featurenames = use_featurenames ? tree.info.featurenames : false
-    root, info = begin
-        if keep_condensed
-            root = MLJXGBoostInterface.solemodel(tree.node; majority=majority, use_featurenames = use_featurenames, kwargs...)
-            info = (;
-                apply_preprocess=(y -> UInt32(findfirst(x -> x == y, tree.info.classlabels))),
-                apply_postprocess=(y -> tree.info.classlabels[y]),
-            )
-            root, info
-        else
-            root = MLJXGBoostInterface.solemodel(tree.node; majority=majority, replace_classlabels = tree.info.classlabels, use_featurenames = use_featurenames, kwargs...)
-            info = (;)
-            root, info
-        end
-    end
-
-    info = merge(info, (;
-        featurenames=tree.info.featurenames,
-        #
-        supporting_predictions=root.info[:supporting_predictions],
-        supporting_labels=root.info[:supporting_labels],
-        )
-    )
-
-    return DecisionTree(root, info)
-end
-
-function solemodel(tree::XGB.Node; majority, replace_classlabels = nothing, use_featurenames = false)
-    if isempty(tree.children)
-        # leaf
-        prediction = majority.ref
-        # labels = tree.leaf
-        # if !isnothing(replace_classlabels)
-        #     prediction = replace_classlabels[prediction]
-        #     labels = replace_classlabels[labels]
-        # end
-        # info = (;
-        #     supporting_predictions = fill(prediction, length(labels)),
-        #     supporting_labels = labels,
-        # )
-        ### TODO
-        labels = [1,1,1,1]
-        info = (;
-            supporting_predictions = fill(prediction, length(labels)),
-            supporting_labels = labels,
-        )
-        return ConstantModel(prediction, info)
-    else
-        # node
-        test_operator = (<)
-        # @show fieldnames(typeof(tree))
-        feature = (use_featurenames != false) ? VariableValue(use_featurenames[split2id(tree.split)]) : VariableValue(split2id(tree.split))
-        cond = ScalarCondition(feature, test_operator, tree.split_condition)
-        antecedent = Atom(cond)
-        lefttree = MLJXGBoostInterface.solemodel(tree.children[1]; majority=majority, replace_classlabels=replace_classlabels, use_featurenames=use_featurenames)
-        righttree = MLJXGBoostInterface.solemodel(tree.children[2]; majority=majority, replace_classlabels=replace_classlabels, use_featurenames=use_featurenames)
-        info = (;
-            supporting_predictions = [lefttree.info[:supporting_predictions]..., righttree.info[:supporting_predictions]...],
-            supporting_labels = [lefttree.info[:supporting_labels]..., righttree.info[:supporting_labels]...],
-        )
-        return Branch(antecedent, lefttree, righttree, info)
-    end
-end
-
-end
\ No newline at end of file
diff --git a/test/XGBoostExt/xgboost_classifier.jl b/test/XGBoostExt/xgboost_classifier.jl
index a885861..2685325 100644
--- a/test/XGBoostExt/xgboost_classifier.jl
+++ b/test/XGBoostExt/xgboost_classifier.jl
@@ -1,7 +1,6 @@
 using Test
 
 using MLJ
-using MLJ.CategoricalArrays: levelcode, categorical
 using MLJBase
 using DataFrames
 
@@ -58,7 +57,7 @@ solem = solemodel(trees, Matrix(X_train), y_train; classlabels, featurenames, ke
 # Make test instances flow into the model
 X_test_f32 = mapcols(col -> Float32.(col), X_test)
 preds = apply(solem, X_test_f32)
-predsl = CategoricalArrays.levelcode.(categorical(preds)) .- 1
+predsl = CategoricalArrays.levelcode.(CategoricalArrays.categorical(preds)) .- 1
 
 apply!(solem, X_test, y_test)
 @test solem.info.supporting_predictions == preds
@@ -67,7 +66,7 @@ apply!(solem, X_test, y_test)
 # ---------------------------------------------------------------------------- #
 #                                julia XGBoost                                  #
 # ---------------------------------------------------------------------------- #
-yl_train = CategoricalArrays.levelcode.(categorical(y_train)) .- 1
+yl_train = CategoricalArrays.levelcode.(CategoricalArrays.categorical(y_train)) .- 1
 # create and train a gradient boosted tree model of 5 trees
 bst = XGB.xgboost(
     (X_train, yl_train),
@@ -143,9 +142,9 @@ println("RandomForest accuracy: ", rm_accuracy)
     solem = solemodel(trees, Matrix(X_train), y_train; classlabels, featurenames)
     X_test_f32 = mapcols(col -> Float32.(col), X_test)
     preds = apply!(solem, X_test_f32, y_test)
-    predsl = CategoricalArrays.levelcode.(categorical(preds)) .- 1
+    predsl = CategoricalArrays.levelcode.(CategoricalArrays.categorical(preds)) .- 1
 
-    yl_train = CategoricalArrays.levelcode.(categorical(y_train)) .- 1
+    yl_train = CategoricalArrays.levelcode.(CategoricalArrays.categorical(y_train)) .- 1
     bst = XGB.xgboost((X_train, yl_train); num_round, eta, num_class=3, objective="multi:softmax")
 
     xg_preds = XGB.predict(bst, X_test)
diff --git a/test/XGBoostExt/xgboost_predict_issue.jl b/test/XGBoostExt/xgboost_predict_issue.jl
deleted file mode 100644
index 198093f..0000000
--- a/test/XGBoostExt/xgboost_predict_issue.jl
+++ /dev/null
@@ -1,156 +0,0 @@
-using MLJ
-using DataFrames
-
-import MLJModelInterface as MMI
-using SoleModels
-import XGBoost as XGB
-using CategoricalArrays
-using Random
-
-# References:
-# https://github.com/chengjunhou/xgb2sql/issues/1
-# https://xgboost.readthedocs.io/en/latest/R-package/xgboostfromJSON.html
-
-# for my own reference:
-# https://xgboost.readthedocs.io/en/latest/build.html
-
-function predict_xgboost_bag(trees, X; n_classes=0, objective="binary:logistic")
-    n_samples = size(X, 1)
-    ntree_limit = length(trees)
-    n_classes == 0 && throw(ArgumentError("n_classes must be specified for multi-class predictions"))
-
-    # Initialize predictions
-    if startswith(objective, "multi:softprob") || startswith(objective, "multi:softmax")
-        # For multi-class probabilities, we need a matrix
-        raw_preds = zeros(Float32, n_samples, n_classes)
-    else
-        # For binary and regression, a vector is sufficient
-        raw_preds = zeros(Float32, n_samples)
-    end
-
-    # Iterate through trees and accumulate predictions
-    for i in 1:ntree_limit
-        tree = trees[i]
-        tree_preds = predict_tree(tree, X)
-
-        if startswith(objective, "multi:softprob") || startswith(objective, "multi:softmax")
-            # For multi-class softprob, each tree outputs predictions for a specific class
-            class_idx = (i - 1) % n_classes + 1
-            raw_preds[:, class_idx] .+= tree_preds
-        else
-            # For binary or regression, simply add the predictions
-            raw_preds .+= tree_preds
-        end
-    end
-    # Apply appropriate transformation based on objective
-    if objective == "binary:logistic"
-        # Apply sigmoid transformation
-        return 1.0 ./ (1.0 .+ exp.(-raw_preds))
-    elseif objective == "multi:softprob"
-        # Apply softmax transformation
-        exp_preds = exp.(raw_preds)
-        row_sums = sum(exp_preds, dims=2)
-        return exp_preds ./ row_sums
-    elseif objective == "multi:softmax"
-        # Return class with highest score
-        if n_classes > 1
-            _, indices = findmax(raw_preds, dims=2)
-            return [idx[2] for idx in indices]
-        else
-            return raw_preds .> 0
-        end
-    elseif objective == "count:poisson"
-        # Apply exponential transformation for Poisson
-        return exp.(raw_preds)
-    else
-        # For regression or other objectives, return raw predictions
-        return raw_preds
-    end
-end
-
-function predict_tree(tree, X)
-    n_samples = size(X, 1)
-    predictions = zeros(Float32, n_samples)
-
-    for i in 1:n_samples
-        predictions[i] = traverse_tree(tree, X[i, :])
-    end
-    return predictions
-end
-
-function traverse_tree(tree, x)
-    # Start at root node
-    node = tree # Adjust based on your tree structure
-
-    # Traverse until reaching a leaf
-    while !isempty(node.children)
-        # Get the split feature and value
-        feature_idx = node.split
-        split_value = Float32(node.split_condition)
-
-        # Decide which child to go to
-        if x[feature_idx] < split_value
-            node = node.children[1]
-        else
-            node = node.children[2]
-        end
-    end
-    # Return the leaf value
-    return Float32(node.leaf)
-end
-
-X, y = @load_iris
-X = DataFrame(X)
-train_ratio = 0.8
-seed, num_round, eta = 3, 1, 0.1
-rng = Xoshiro(seed)
-train, test = partition(eachindex(y), train_ratio; shuffle=true, rng)
-X_train, y_train = X[train, :], y[train]
-X_test, y_test = X[test, :], y[test]
-
-XGTrees = MLJ.@load XGBoostClassifier pkg=XGBoost
-model = XGTrees(; num_round, eta, objective="multi:softprob")
-mach = machine(model, X_train, y_train)
-fit!(mach)
-# mlj_predict = predict(mach, DataFrame(X_test[27,:]))
-mlj_predict = predict(mach, DataFrame(X_test[28,:]))
-
-trees = XGB.trees(mach.fitresult[1])
-get_encoding(classes_seen) = Dict(MMI.int(c) => c for c in MMI.classes(classes_seen))
-get_classlabels(encoding) = [string(encoding[i]) for i in sort(keys(encoding) |> collect)]
-encoding = get_encoding(mach.fitresult[2])
-classlabels = get_classlabels(encoding)
-featurenames = mach.report.vals[1].features
-
-solem = solemodel(trees, Matrix(X_train), y_train; classlabels, featurenames, use_float32=false)
-preds = apply(solem, DataFrame(reshape(Float32.(Vector(X_test[28,:])), 1, :), :auto)) # NOT WORKING
-@test preds[1] == "versicolor"
-
-solem = solemodel(trees, Matrix(X_train), y_train; classlabels, featurenames, use_float32=true)
-preds = apply(solem, DataFrame(reshape(Float32.(Vector(X_test[28,:])), 1, :), :auto)) # WORKING
-@test preds[1] == "virginica"
-
-solem = solemodel(trees, Matrix(X_train), y_train; classlabels, featurenames)
-preds = apply(solem, DataFrame(reshape(Float32.(Vector(X_test[28,:])), 1, :), :auto)) # WORKING
-@test preds[1] == "virginica"
-
-predsl = CategoricalArrays.levelcode.(categorical(preds)) .- 1
-
-yl_train = CategoricalArrays.levelcode.(categorical(y_train)) .- 1
-bst = XGB.xgboost((X_train, yl_train); num_round, eta, num_class=3, objective="multi:softprob")
-xtrs = XGB.trees(bst)
-# yyy = XGB.predict(bst, DataFrame(X_test[27,:])) # WORKING
-yyy = XGB.predict(bst, DataFrame(X_test[28,:])) # NOT WORKING
-
-
-# # For multi-class classification
-rename!(X_test, [:f0, :f1, :f2, :f3])
-# class_probs = predict_xgboost_bag(trees, DataFrame(X_test[27,:]); n_classes=3, objective="multi:softprob") # WORKING
-class_probs = predict_xgboost_bag(trees, DataFrame(X_test[28,:]); n_classes=3, objective="multi:softprob") # NOT WORKING
-class_preds = [argmax(probs) for probs in eachrow(class_probs)] .-1
-
-X_train32 = DataFrame(Float32.(Matrix(X_train)), [:f0, :f1, :f2, :f3])
-bst32 = XGB.xgboost((X_train32, yl_train); num_round, eta, num_class=3, objective="multi:softprob")
-xtrs32 = XGB.trees(bst32)
-X_test32 = DataFrame(reshape(Float32.(Vector(X_test[28,:])), 1, :), [:f0, :f1, :f2, :f3])
-class_probs32 = predict_xgboost_bag(xtrs32, X_test32; n_classes=3, objective="multi:softprob") # NOT WORKING

From 6c34e4df37c948d162fd0dcffae657219bd39c9e Mon Sep 17 00:00:00 2001
From: PasoStudio73
Date: Sun, 27 Apr 2025 18:24:19 +0200
Subject: [PATCH 34/44] cleaned unused packages, tests still working

---
 Project.toml | 22 +---------------------
 1 file changed, 1 insertion(+), 21 deletions(-)

diff --git a/Project.toml b/Project.toml
index 999ae09..131ef09 100644
--- a/Project.toml
+++ b/Project.toml
@@ -6,29 +6,18 @@ version = "0.10.0"
 
 [deps]
 AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
-CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
 CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
-DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
 FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
 FunctionWrappers = "069b7b12-0de2-55c6-9aab-29f3d0a68a2e"
-Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6"
-HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
 IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e"
 Lazy = "50d2b5c4-7a5e-59d5-8109-a42b560f39c0"
-LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
-Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
 PrettyTables = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d"
-ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
 Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
-Revise = "295af30f-e4ad-537b-8983-00126c2a3abe"
 SoleBase = "4475fa32-7023-44a0-aa70-4813b230e492"
 SoleData = "123f1ae1-6307-4526-ab5b-aab3a92a2b8c"
 SoleLogics = "b002da8f-3cb3-4d91-bbe3-2953433912b5"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
-Suppressor = "fd094767-a336-5f1f-9728-57cf17d0bbfb"
-Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
 ThreadSafeDicts = "4239201d-c60e-5e0a-9702-85d713665ba7"
-ZipFile = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea"
 
 [weakdeps]
 DecisionTree = "7806a523-6efd-50cb-b5f6-3fa6f1930dbb"
@@ -40,31 +29,22 @@ XGBoostExt = "XGBoost"
 
 [compat]
 AbstractTrees = "0.4"
-CSV = "0.10"
 CategoricalArrays = "0.10"
 DataFrames = "1"
-DataStructures = "0.18"
 DecisionTree = "0.12"
 FillArrays = "1"
 FunctionWrappers = "1"
-Graphs = "1.8"
-HTTP = "1.9"
 IterTools = "1"
 Lazy = "0.15.1"
 PrettyTables = "2.2"
-ProgressMeter = "1"
 Random = "1"
 Reexport = "1"
-Revise = "3"
-SoleBase = "0.11 - 0.13"
+SoleBase = "0.13.0"
 SoleData = "0.15, 0.16"
 SoleLogics = "0.11 - 0.13"
 StatsBase = "0.30 - 0.34"
-Suppressor = "0.2"
-Tables = "1"
 ThreadSafeDicts = "0.1"
 XGBoost = "2"
-ZipFile = "0.10"
 julia = "1"
 
 [extras]

From bcefa20fed26f9127a13c1c41e4915d282f65361 Mon Sep 17 00:00:00 2001
From: PasoStudio73
Date: Sun, 27 Apr 2025 18:37:32 +0200
Subject: [PATCH 35/44] updated SoleData dep

---
 Project.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Project.toml b/Project.toml
index 131ef09..d75341c 100644
--- a/Project.toml
+++ b/Project.toml
@@ -40,8 +40,8 @@ PrettyTables = "2.2"
 Random = "1"
 Reexport = "1"
 SoleBase = "0.13.0"
-SoleData = "0.15, 0.16"
-SoleLogics = "0.11 - 0.13"
+SoleData = "0.16.1"
+SoleLogics = "0.13"
 StatsBase = "0.30 - 0.34"
 ThreadSafeDicts = "0.1"
 XGBoost = "2"

From ba4d143f6e561e7284ca6572e87ca5287cb58b2b Mon Sep 17 00:00:00 2001
From: PasoStudio73
Date: Sun, 27 Apr 2025 21:51:51 +0200
Subject: [PATCH 36/44] updated solelogics dep

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index d75341c..ddfa3e3 100644
--- a/Project.toml
+++ b/Project.toml
@@ -41,7 +41,7 @@ Random = "1"
 Reexport = "1"
 SoleBase = "0.13.0"
 SoleData = "0.16.1"
-SoleLogics = "0.13"
+SoleLogics = "0.13.1"
 StatsBase = "0.30 - 0.34"
 ThreadSafeDicts = "0.1"
 XGBoost = "2"

From 72efb5499df12474d2cae0098d0e9ef8b52eb736 Mon Sep 17 00:00:00 2001
From: PasoStudio73
Date: Fri, 2 May 2025 11:49:20 +0200
Subject: [PATCH 37/44] added a @show for checking

---
 ext/DecisionTreeExt.jl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ext/DecisionTreeExt.jl b/ext/DecisionTreeExt.jl
index 1755fb5..041d887 100644
--- a/ext/DecisionTreeExt.jl
+++ b/ext/DecisionTreeExt.jl
@@ -57,6 +57,7 @@ function SoleModels.solemodel(
     keep_condensed = false,
     kwargs...
 ) where {T,orig_O}
+@show "PASO"
     # TODO rewrite error according to orig_O
     # if isnothing(classlabels)
     #     error("Please, provide classlabels argument, as in solemodel(forest; classlabels = classlabels, kwargs...). If your forest was trained via MLJ, use `classlabels = (mach).fitresult[2][sortperm((mach).fitresult[3])]`. Also consider providing `featurenames = report(mach).features`.")

From bd889ac829212aae1c5d383edc4eafb5c735b28c Mon Sep 17 00:00:00 2001
From: PasoStudio73
Date: Fri, 2 May 2025 12:28:42 +0200
Subject: [PATCH 38/44] removed guard

---
 ext/DecisionTreeExt.jl | 1 -
 1 file changed, 1 deletion(-)

diff --git a/ext/DecisionTreeExt.jl b/ext/DecisionTreeExt.jl
index 041d887..1755fb5 100644
--- a/ext/DecisionTreeExt.jl
+++ b/ext/DecisionTreeExt.jl
@@ -57,7 +57,6 @@ function SoleModels.solemodel(
     keep_condensed = false,
     kwargs...
 ) where {T,orig_O}
-@show "PASO"
     # TODO rewrite error according to orig_O
     # if isnothing(classlabels)
     #     error("Please, provide classlabels argument, as in solemodel(forest; classlabels = classlabels, kwargs...). If your forest was trained via MLJ, use `classlabels = (mach).fitresult[2][sortperm((mach).fitresult[3])]`. Also consider providing `featurenames = report(mach).features`.")

From 83ffb33fa2fb42c61541c640fe8432fcfd166e26 Mon Sep 17 00:00:00 2001
From: PasoStudio73
Date: Sat, 7 Jun 2025 15:36:47 +0200
Subject: [PATCH 39/44] test working

---
 Project.toml                          | 34 ++++++++++++++------------
 pluto-demo.jl                         | 10 ++++----
 test/DecisionTreeExt/adaboost.jl      | 13 ----------
 test/DecisionTreeExt/forest.jl        | 12 ----------
 test/DecisionTreeExt/tree.jl          | 13 ----------
 test/XGBoostExt/xgboost_classifier.jl |  2 +-
 test/base.jl                          |  7 ------
 test/juliacon2024.jl                  | 12 +++++-----
 test/linear-form-utilities.jl         |  5 ----
 test/misc.jl                          | 11 ---------
 test/parse.jl                         |  7 ------
 test/runtests.jl                      | 30 +++++++++++++++++----
 test/test_tree.jl                     |  3 ---
 13 files changed, 55 insertions(+), 104 deletions(-)

diff --git a/Project.toml b/Project.toml
index ddfa3e3..86ccd06 100644
--- a/Project.toml
+++ b/Project.toml
@@ -2,7 +2,7 @@ name = "SoleModels"
 uuid = "4249d9c7-3290-4ddd-961c-e1d3ec2467f8"
 license = "MIT"
 authors = ["Michele GHIOTTI", "Giovanni PAGLIARINI", "Edoardo PONSANESI", "Eduard I. STAN"]
-version = "0.10.0"
+version = "0.10.1"
 
 [deps]
 AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
@@ -30,41 +30,43 @@ XGBoostExt = "XGBoost"
 
 [compat]
 AbstractTrees = "0.4"
 CategoricalArrays = "0.10"
-DataFrames = "1"
-DecisionTree = "0.12"
 FillArrays = "1"
 FunctionWrappers = "1"
 IterTools = "1"
-Lazy = "0.15.1"
+Lazy = "0.15"
 PrettyTables = "2.2"
-Random = "1"
 Reexport = "1"
-SoleBase = "0.13.0"
-SoleData = "0.16.1"
-SoleLogics = "0.13.1"
+SoleBase = "0.13"
+SoleData = "0.16"
+SoleLogics = "0.13"
 StatsBase = "0.30 - 0.34"
 ThreadSafeDicts = "0.1"
-XGBoost = "2"
 julia = "1"
 
 [extras]
-CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 DecisionTree = "7806a523-6efd-50cb-b5f6-3fa6f1930dbb"
-FunctionWrappers = "069b7b12-0de2-55c6-9aab-29f3d0a68a2e"
 InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 MLJ = "add582a8-e3ab-11e8-2d5e-e98b27df1bc7"
-MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
 MLJDecisionTreeInterface = "c6f25543-311c-4c74-83dc-3ea6d1015661"
 MLJModelInterface = "e80e1ace-859a-464e-9ed9-23947d8ae3ea"
 Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a"
 MultiData = "8cc5100c-b3d1-4f82-90cb-0ea93d317aba"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
-SoleData = "123f1ae1-6307-4526-ab5b-aab3a92a2b8c"
-SoleLogics = "b002da8f-3cb3-4d91-bbe3-2953433912b5"
-SoleModels = "4249d9c7-3290-4ddd-961c-e1d3ec2467f8"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 XGBoost = "009559a3-9522-5dbb-924b-0b6ed2b22bb9"
 
 [targets]
-test = ["Test", "CategoricalArrays", "DataFrames", "Random", "MLJ", "MultiData", "Markdown", "InteractiveUtils", "MLJBase", "XGBoost", "DecisionTree", "MLJModelInterface", "MLJDecisionTreeInterface", "SoleData", "SoleModels", "SoleLogics", "FunctionWrappers"]
+test = [
+    "DataFrames",
+    "DecisionTree",
+    "InteractiveUtils",
+    "MLJ",
+    "MLJDecisionTreeInterface",
+    "MLJModelInterface",
+    "Markdown",
+    "MultiData",
+    "Random",
+    "Test",
+    "XGBoost"
+]
diff --git a/pluto-demo.jl b/pluto-demo.jl
index ce978a6..095231b 100644
--- a/pluto-demo.jl
+++ b/pluto-demo.jl
@@ -1,13 +1,13 @@
 ### A Pluto.jl notebook ###
 # v0.19.38
 
-using Markdown
-using InteractiveUtils
+# using Markdown
+# using InteractiveUtils
 
 # ╔═╡ 7685d19e-cc98-4031-a6f9-29ecccc9f417
 begin
-    using SoleModels
-    using DataFrames
+    # using SoleModels
+    # using DataFrames
 
     # Load an example time-series classification dataset as a tuple (DataFrame, Vector{String})
     X_df, y = SoleModels.load_arff_dataset("NATOPS")
@@ -32,7 +32,7 @@ end
 
 # ╔═╡ 1ccda54b-1b70-4353-ace6-fe277e5bf67f
 begin
-    using MultiData
+    # using MultiData
 
     # Construct a logiset from a DataFrame
     logiset = scalarlogiset(X_df, features)
diff --git a/test/DecisionTreeExt/adaboost.jl b/test/DecisionTreeExt/adaboost.jl
index 1c2b921..445b8a7 100644
--- a/test/DecisionTreeExt/adaboost.jl
+++ b/test/DecisionTreeExt/adaboost.jl
@@ -1,16 +1,3 @@
-using Test
-
-using MLJ
-using MLJBase
-using DataFrames
-using CategoricalArrays
-
-using MLJDecisionTreeInterface
-using SoleModels
-using Random
-
-import DecisionTree as DT
-
 X, y = @load_iris
 X = DataFrame(X)
 
diff --git a/test/DecisionTreeExt/forest.jl b/test/DecisionTreeExt/forest.jl
index fa7dccc..d4577aa 100644
--- a/test/DecisionTreeExt/forest.jl
+++ b/test/DecisionTreeExt/forest.jl
@@ -1,15 +1,3 @@
-using Test
-
-using MLJ
-using MLJBase
-using DataFrames
-
-using MLJDecisionTreeInterface
-using SoleModels
-using Random
-
-import DecisionTree as DT
-
 X, y = @load_iris
 X = DataFrame(X)
 
diff --git a/test/DecisionTreeExt/tree.jl b/test/DecisionTreeExt/tree.jl
index 909eb31..2d864a0 100644
--- a/test/DecisionTreeExt/tree.jl
+++ b/test/DecisionTreeExt/tree.jl
@@ -1,16 +1,3 @@
-using Test
-
-using MLJ
-using MLJBase
-using DataFrames
-
-using MLJDecisionTreeInterface
-using SoleModels
-using Random
-using CategoricalArrays
-
-import DecisionTree as DT
-
 X, y = @load_iris
 X = DataFrame(X)
 
diff --git a/test/XGBoostExt/xgboost_classifier.jl b/test/XGBoostExt/xgboost_classifier.jl
index 2685325..0d2c1b3 100644
--- a/test/XGBoostExt/xgboost_classifier.jl
+++ b/test/XGBoostExt/xgboost_classifier.jl
@@ -1,7 +1,7 @@
 using Test
 
 using MLJ
-using MLJBase
+# using MLJBase
 using DataFrames
 
 using SoleModels
diff --git a/test/base.jl b/test/base.jl
index 5dd8767..f9c98dc 100644
--- a/test/base.jl
+++ b/test/base.jl
@@ -1,10 +1,3 @@
-using SoleModels
-using SoleLogics
-using FunctionWrappers: FunctionWrapper
-using SoleModels: AbstractModel
-using SoleModels: ConstantModel, LeafModel
-using Test
-
 # base.jl
 
 io = IOBuffer()
diff --git a/test/juliacon2024.jl b/test/juliacon2024.jl
index 93f135c..ab50e19 100644
--- a/test/juliacon2024.jl
+++ b/test/juliacon2024.jl
@@ -1,12 +1,12 @@
 # JuliaCon2024 demo
 
 # Load packages
-begin
-    using MLJ
-    using MLJDecisionTreeInterface
-    using DataFrames
-    using Random
-end
+# begin
+#     using MLJ
+#     using MLJDecisionTreeInterface
+#     using DataFrames
+#     using Random
+# end
 
 # Load dataset
 X, y = begin
diff --git a/test/linear-form-utilities.jl b/test/linear-form-utilities.jl
index d55fcb2..09fbb9b 100644
--- a/test/linear-form-utilities.jl
+++ b/test/linear-form-utilities.jl
@@ -1,8 +1,3 @@
-using Test
-using SoleLogics
-using SoleModels
-
-
 b = Branch(LeftmostConjunctiveForm((@atoms p q r s)), "YES", "NO")
 
 @test_nowarn b[1:3]
diff --git a/test/misc.jl b/test/misc.jl
index b048a49..a4594a4 100644
--- a/test/misc.jl
+++ b/test/misc.jl
@@ -1,14 +1,3 @@
-# using Revise
-
-# using Reexport
-using FunctionWrappers: FunctionWrapper
-using Test
-using SoleLogics
-using SoleModels
-using SoleModels: AbstractModel
-using SoleModels: ConstantModel, LeafModel
-using SoleModels: listrules, displaymodel, submodels
-
 io = IOBuffer()
 
 parse_other_kind_of_formula = SoleLogics.parseformula
diff --git a/test/parse.jl b/test/parse.jl
index a4e905e..5d1a019 100644
--- a/test/parse.jl
+++ b/test/parse.jl
@@ -1,10 +1,3 @@
-using Test
-using SoleModels
-using SoleData
-using SoleData: AbstractUnivariateFeature, Feature
-using SoleData: ScalarCondition
-using SoleData: feature
-
 ############################################################################################
 ############################ Orange parser #################################################
 ############################################################################################
diff --git a/test/runtests.jl b/test/runtests.jl
index 986db9e..f78c9ae 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,8 +1,28 @@
-# using Revise
-using SoleModels
-using SoleLogics
-using Test
-using Random
+using Distributed
+addprocs(2)
+
+@everywhere begin
+    using SoleModels
+    using SoleModels: AbstractModel
+    using SoleModels: ConstantModel, LeafModel
+    using SoleModels: listrules, displaymodel, submodels
+    using SoleData
+    using SoleData: AbstractUnivariateFeature, Feature
+    using SoleData: ScalarCondition
+    using SoleData: feature
+    using SoleLogics
+    using CategoricalArrays
+    using Markdown
+    using MultiData
+    using InteractiveUtils
+    using MLJ
+    using MLJDecisionTreeInterface
+    import DecisionTree as DT
+    using DataFrames
+    using Test
+    using Random
+    using FunctionWrappers: FunctionWrapper
+end
 
 function run_tests(list)
     println("\n" * ("#"^50))
diff --git a/test/test_tree.jl b/test/test_tree.jl
index af4cb1b..38c8450 100644
--- a/test/test_tree.jl
+++ b/test/test_tree.jl
@@ -8,9 +8,6 @@
 #           "yes"   "no"  "yes"   "no"
 ##################################################
 
-using SoleLogics
-using SoleModels
-
 formula_p = SoleLogics.parseformula("p")
 formula_q = SoleLogics.parseformula("q")
 formula_r = SoleLogics.parseformula("r")

From 54ae95f557e1835c1039f7b21f5e18f36ed859b3 Mon Sep 17 00:00:00 2001
From: PasoStudio73
Date: Sat, 7 Jun 2025 15:46:05 +0200
Subject: [PATCH 40/44] updated github ci

---
 .cirrus.yml                  | 20 ---------------
 .github/dependabot.yml       |  7 ++++++
 .github/workflows/TagBot.yml | 16 ++++++++++++
 .github/workflows/ci.yml     | 49 ++++++++++++++++++++++++++++--------
 4 files changed, 61 insertions(+), 31 deletions(-)
 delete mode 100644 .cirrus.yml
 create mode 100644 .github/dependabot.yml

diff --git a/.cirrus.yml b/.cirrus.yml
deleted file mode 100644
index 013f726..0000000
--- a/.cirrus.yml
+++ /dev/null
@@ -1,20 +0,0 @@
-freebsd_instance:
-  image_family: freebsd-14-2
-task:
-  name: FreeBSD
-  artifacts_cache:
-    folder: ~/.julia/artifacts
-  env:
-    matrix:
-      - JULIA_VERSION: 1.9
-      - JULIA_VERSION: 1
-      - JULIA_VERSION: nightly
-    allow_failures: $JULIA_VERSION == 'nightly'
-  install_script:
-    - sh -c "$(fetch https://raw.githubusercontent.com/ararslan/CirrusCI.jl/master/bin/install.sh -o -)"
-  build_script:
-    - cirrusjl build
-  test_script:
-    - cirrusjl test
-  coverage_script:
-    - cirrusjl coverage codecov
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
new file mode 100644
index 0000000..700707c
--- /dev/null
+++ b/.github/dependabot.yml
@@ -0,0 +1,7 @@
+# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
+version: 2
+updates:
+  - package-ecosystem: "github-actions"
+    directory: "/" # Location of package manifests
+    schedule:
+      interval: "weekly"
diff --git a/.github/workflows/TagBot.yml b/.github/workflows/TagBot.yml
index f49313b..0cd3114 100644
--- a/.github/workflows/TagBot.yml
+++ b/.github/workflows/TagBot.yml
@@ -4,6 +4,22 @@ on:
     types:
       - created
   workflow_dispatch:
+    inputs:
+      lookback:
+        default: "3"
+permissions:
+  actions: read
+  checks: read
+  contents: write
+  deployments: read
+  issues: read
+  discussions: read
+  packages: read
+  pages: read
+  pull-requests: read
+  repository-projects: read
+  security-events: read
+  statuses: read
 jobs:
   TagBot:
     if: github.event_name == 'workflow_dispatch' || github.actor == 'JuliaTagBot'
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index ba475a2..d12f804 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -1,20 +1,47 @@
-name: Upload coverage reports to Codecov
-on: [push, pull_request]
+name: CI
+on:
+  push:
+    branches:
+      - lumen-refactoring
+    tags: ['*']
+  pull_request:
+  workflow_dispatch:
+concurrency:
+  # Skip intermediate builds: always.
+  # Cancel intermediate builds: only if it is a pull request build.
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
 jobs:
-  run:
-    runs-on: ubuntu-latest
+  test:
+    name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }}
+    runs-on: ${{ matrix.os }}
+    timeout-minutes: 60
+    permissions: # needed to allow julia-actions/cache to proactively delete old caches that it has created
+      actions: write
+      contents: read
+    strategy:
+      fail-fast: false
+      matrix:
+        version:
+          - '1'
+          - 'lts'
+          - 'pre'
+        os:
+          - ubuntu-latest
+        arch:
+          - x64
     steps:
-      - name: Checkout
-        uses: actions/checkout@v2
-      - name: Set up Julia 1.9.0
-        uses: julia-actions/setup-julia@v1
+      - uses: actions/checkout@v4
+      - uses: julia-actions/setup-julia@v2
         with:
-          version: "1.9.0"
+          version: ${{ matrix.version }}
+          arch: ${{ matrix.arch }}
+      - uses: julia-actions/cache@v2
       - uses: julia-actions/julia-buildpkg@v1
       - uses: julia-actions/julia-runtest@v1
       - uses: julia-actions/julia-processcoverage@v1
       - uses: codecov/codecov-action@v5
         with:
+          files: lcov.info
           token: ${{ secrets.CODECOV_TOKEN }}
-          slug: aclai-lab/SoleModels.jl
-
+          fail_ci_if_error: false
\ No newline at end of file

From 4275b3d5af9a9c941d87d8863ef0361b286d86cd Mon Sep 17 00:00:00 2001
From: PasoStudio73
Date: Sat, 7 Jun 2025 15:54:30 +0200
Subject: [PATCH 41/44] added distributed package

---
 Project.toml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Project.toml b/Project.toml
index 86ccd06..e70222d 100644
--- a/Project.toml
+++ b/Project.toml
@@ -46,6 +46,7 @@ julia = "1"
 [extras]
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 DecisionTree = "7806a523-6efd-50cb-b5f6-3fa6f1930dbb"
+Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
 InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 MLJ = "add582a8-e3ab-11e8-2d5e-e98b27df1bc7"
 MLJDecisionTreeInterface = "c6f25543-311c-4c74-83dc-3ea6d1015661"
 MLJModelInterface = "e80e1ace-859a-464e-9ed9-23947d8ae3ea"
@@ -60,6 +61,7 @@ test = [
     "DataFrames",
     "DecisionTree",
+    "Distributed",
     "InteractiveUtils",
     "MLJ",

From 42c052ebcee8d31b6ec2585c21521f479b387648 Mon Sep 17 00:00:00 2001
From: PasoStudio73
Date: Sat, 7 Jun 2025 16:05:36 +0200
Subject: [PATCH 42/44] added MLJXGBoostInterface package

---
 Project.toml                          |  2 ++
 test/XGBoostExt/xgboost_classifier.jl | 13 -------------
 2 files changed, 2 insertions(+), 13 deletions(-)

diff --git a/Project.toml b/Project.toml
index e70222d..c927db9 100644
--- a/Project.toml
+++ b/Project.toml
@@ -51,6 +51,7 @@ InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 MLJ = "add582a8-e3ab-11e8-2d5e-e98b27df1bc7"
 MLJDecisionTreeInterface = "c6f25543-311c-4c74-83dc-3ea6d1015661"
 MLJModelInterface = "e80e1ace-859a-464e-9ed9-23947d8ae3ea"
+MLJXGBoostInterface = "54119dfa-1dab-4055-a167-80440f4f7a91"
 Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a"
 MultiData = "8cc5100c-b3d1-4f82-90cb-0ea93d317aba"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
@@ -66,6 +67,7 @@ test = [
     "MLJ",
"MLJDecisionTreeInterface", "MLJModelInterface", + "MLJXGBoostInterface", "Markdown", "MultiData", "Random", diff --git a/test/XGBoostExt/xgboost_classifier.jl b/test/XGBoostExt/xgboost_classifier.jl index 0d2c1b3..ad04f0a 100644 --- a/test/XGBoostExt/xgboost_classifier.jl +++ b/test/XGBoostExt/xgboost_classifier.jl @@ -1,16 +1,3 @@ -using Test - -using MLJ -# using MLJBase -using DataFrames - -using SoleModels - -import MLJModelInterface as MMI -import XGBoost as XGB - -using Random, CategoricalArrays - X, y = @load_iris X = DataFrame(X) From 1218cb12ca668b9c2c76cd8cbc95dad6ca097622 Mon Sep 17 00:00:00 2001 From: PasoStudio73 Date: Sat, 7 Jun 2025 16:16:39 +0200 Subject: [PATCH 43/44] fix XGB --- test/runtests.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/test/runtests.jl b/test/runtests.jl index f78c9ae..4897596 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -18,6 +18,7 @@ addprocs(2) using MLJ using MLJDecisionTreeInterface import DecisionTree as DT + import XGBoost as XGB using DataFrames using Test using Random From 37daad695e48c31254e96b101ecebe1c49caa63c Mon Sep 17 00:00:00 2001 From: PasoStudio73 Date: Sat, 7 Jun 2025 16:33:47 +0200 Subject: [PATCH 44/44] added MMI --- test/runtests.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/test/runtests.jl b/test/runtests.jl index 4897596..a9b6a96 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -18,6 +18,7 @@ addprocs(2) using MLJ using MLJDecisionTreeInterface import DecisionTree as DT + import MLJModelInterface as MMI import XGBoost as XGB using DataFrames using Test